import os
import random
import time
from datetime import datetime

from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import ActivityLog, DownloadPathConfig, PaperMetadata


class Scraper(BaseScraper):
    """Retry scraper that attempts to re-process failed papers with different strategies."""

    # This scraper specifically targets papers marked "Failed" and retries them.
    INPUT_STATUSES = ["Failed"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Retrying"
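
    # NOTE (assumption): the handling of the constants above lives in
    # BaseScraper, which this file does not show. The usual pattern would be
    # for the runner to select papers whose status is in INPUT_STATUSES, mark
    # them OUTPUT_STATUS_PROCESSING while scrape() runs, then persist
    # OUTPUT_STATUS_SUCCESS or OUTPUT_STATUS_FAILURE from the ScrapeResult.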
    def scrape(self, doi: str) -> ScrapeResult:
        """Retry scraping a failed paper with enhanced error handling."""
        start_time = time.time()
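
        # Look up the paper record for this DOI; without it there is nothing to retry.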
        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log the retry attempt before any work is done.
        ActivityLog.log_scraper_activity(
            action="retry_failed_paper",
            status="info",
            description=f"Retrying failed paper: {paper.title}",
            paper_id=paper.id
        )

        # Simulate longer processing time for a retry (2-5 seconds).
        processing_time = random.uniform(2, 5)
        time.sleep(processing_time)

        # Simulate a 60% success rate on retry (lower than the initial attempt).
        success = random.random() < 0.6

        result_data = {}
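
        # (Assumption about intent: the sleep and coin flip above stand in for
        # real work; a production retry would attempt alternative download
        # strategies here instead.)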
        if success:
            # Get the download path and create a dummy file.
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}_retry.pdf"
            # os.path.join keeps the path portable across platforms.
            file_path = os.path.join(download_path, file_name)

            try:
                # Ensure the download directory exists.
                os.makedirs(download_path, exist_ok=True)

                # Create a dummy PDF file in place of a real download.
                with open(file_path, 'w') as f:
                    f.write(f"Dummy PDF content for retry of {doi}")

                result_data = {"file_path": file_path}

                # Log success.
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_success",
                    status="success",
                    description=f"Successfully retried {doi} after a failed attempt",
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="success",
                    message=f"Successfully retried paper {doi}",
                    data=result_data,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            except Exception as e:
                error_msg = f"Failed to save retry file: {e}"
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="error",
                    message=error_msg,
                    data=None,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
        else:
            # Retry failed: pick one of several simulated error messages.
            error_messages = [
                "Retry failed: Still no access to publisher",
                "Retry failed: Alternative download methods exhausted",
                "Retry failed: DOI appears permanently inaccessible",
                "Retry failed: Network timeout persists"
            ]
            error_msg = random.choice(error_messages)

            ActivityLog.log_scraper_activity(
                action="retry_scrape_failure",
                status="error",
                description=f"{error_msg} (DOI: {doi})",
                paper_id=paper.id
            )

            result = ScrapeResult(
                status="error",
                message=error_msg,
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        return result
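

# Usage sketch (illustrative only): how this scraper might be driven manually.
# `create_app` and the import paths below are assumptions, not names confirmed
# by this file; substitute the project's real app factory and module path. The
# DB queries in scrape() require an active Flask application context.
#
#     from myapp import create_app                  # hypothetical factory
#     from myapp.scrapers.retry import Scraper      # hypothetical module path
#
#     app = create_app()
#     with app.app_context():
#         result = Scraper().scrape("10.1234/example-doi")
#         print(result.status, result.message)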