import re
import time
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse

import requests

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import PaperMetadata


class Scraper(BaseScraper):
    """Publisher detection scraper that identifies the publisher from the
    final URL reached after following the DOI redirect."""

    # This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "PublisherDetected"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "DetectingPublisher"

    # Publisher detection patterns based on URL domains and paths
    PUBLISHER_URL_PATTERNS = {
        'elsevier': [
            r'sciencedirect\.com',
            r'elsevier\.com',
            r'.*\.elsevier\.com',
        ],
        'springer': [
            r'link\.springer\.com',
            r'springer\.com',
            r'.*\.springer\.com',
        ],
        'wiley': [
            r'onlinelibrary\.wiley\.com',
            r'wiley\.com',
            r'.*\.wiley\.com',
        ],
        'ieee': [
            r'ieeexplore\.ieee\.org',
            r'ieee\.org',
            r'.*\.ieee\.org',
        ],
        'plos': [
            r'journals\.plos\.org',
            r'plos\.org',
            r'.*\.plos\.org',
        ],
        'nature': [
            r'nature\.com',
            r'.*\.nature\.com',
        ],
        'sage': [
            r'journals\.sagepub\.com',
            r'sagepub\.com',
            r'.*\.sagepub\.com',
        ],
        'taylor_francis': [
            r'tandfonline\.com',
            r'.*\.tandfonline\.com',
        ],
        'acs': [
            r'pubs\.acs\.org',
            r'acs\.org',
            r'.*\.acs\.org',
        ],
        'arxiv': [
            r'arxiv\.org',
            r'export\.arxiv\.org',
        ],
        'pubmed': [
            r'pubmed\.ncbi\.nlm\.nih\.gov',
            r'ncbi\.nlm\.nih\.gov',
        ],
        'oxford': [
            r'academic\.oup\.com',
            r'oup\.com',
            r'.*\.oup\.com',
        ],
        'cambridge': [
            r'cambridge\.org',
            r'.*\.cambridge\.org',
        ],
        'biorxiv': [
            r'biorxiv\.org',
            r'.*\.biorxiv\.org',
        ],
        'researchgate': [
            r'researchgate\.net',
            r'.*\.researchgate\.net',
        ],
    }

    def scrape(self, doi: str) -> ScrapeResult:
        """Detect publisher from the final URL after DOI redirect."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        try:
            # Get the final URL by following the DOI redirect
            final_url = self._get_final_url(doi)

            if not final_url:
                error_msg = f"Could not resolve DOI {doi} to a URL"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "doi_resolution_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

            # Detect publisher from URL
            detected_publisher = self._detect_publisher_from_url(final_url)

            if detected_publisher:
                # Update paper with detected publisher
                paper.publisher = detected_publisher
                paper.status = self.OUTPUT_STATUS_SUCCESS
                paper.error_msg = None
                db.session.commit()

                success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
                self.log_scrape_success(doi, success_msg, paper.id)

                return ScrapeResult(
                    status="success",
                    message=success_msg,
                    data={
                        "publisher": detected_publisher,
                        "final_url": final_url,
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )
            else:
                error_msg = f"Could not detect publisher from URL: {final_url}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={
                        "final_url": final_url,
                        "error_code": "publisher_not_detected",
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

        except Exception as e:
            error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "publisher_detection_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

    def _get_final_url(self, doi: str) -> Optional[str]:
        """
        Get the final URL after following DOI redirects.

        Args:
            doi: The DOI to resolve

        Returns:
            Final URL after redirects, or None if resolution fails
        """
        try:
            doi_url = f"https://doi.org/{doi}"
            headers = {
                'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            }

            # Make a HEAD request to get the final URL without downloading content
            response = requests.head(
                doi_url,
                headers=headers,
                timeout=15,
                allow_redirects=True,
            )

            # If HEAD is not allowed, retry with GET but avoid downloading the body
            if response.status_code == 405:  # Method Not Allowed
                response = requests.get(
                    doi_url,
                    headers=headers,
                    timeout=15,
                    allow_redirects=True,
                    stream=True,  # Don't download the full content
                )
                response.close()  # Close the connection once the headers are read

            if response.status_code in [200, 302, 301]:
                return response.url
            return None

        except Exception:
            # Don't raise; the caller treats a None result as a resolution failure
            return None

    def _detect_publisher_from_url(self, url: str) -> Optional[str]:
        """
        Detect publisher from URL using domain patterns.

        Args:
            url: The URL to analyze

        Returns:
            Publisher name if detected, None otherwise
        """
        if not url:
            return None

        # Parse the URL to get the domain
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()

        # Remove 'www.' prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        # Score each publisher based on URL pattern matches
        publisher_scores = {}
        for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
            score = 0
            for pattern in patterns:
                if re.search(pattern, domain, re.IGNORECASE):
                    score += 10  # Strong match for domain patterns
                # Also check the full URL for path-based patterns
                if re.search(pattern, url.lower(), re.IGNORECASE):
                    score += 5
            if score > 0:
                publisher_scores[publisher] = score

        # Return the publisher with the highest score
        if publisher_scores:
            best_publisher = max(publisher_scores, key=publisher_scores.get)
            # Only return a match with reasonable confidence (score > 5)
            if publisher_scores[best_publisher] > 5:
                return best_publisher

        return None