# SciPaperLoader/scipaperloader/scrapers/publisher_detector.py
import time
import requests
import re
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
# Publisher detection patterns based on URL domains and paths
PUBLISHER_URL_PATTERNS = {
        'elsevier': [
            r'sciencedirect\.com',
            r'elsevier\.com',
            r'.*\.elsevier\.com'
        ],
        'springer': [
            r'link\.springer\.com',
            r'springer\.com',
            r'.*\.springer\.com'
        ],
        'wiley': [
            r'onlinelibrary\.wiley\.com',
            r'wiley\.com',
            r'.*\.wiley\.com'
        ],
        'ieee': [
            r'ieeexplore\.ieee\.org',
            r'ieee\.org',
            r'.*\.ieee\.org'
        ],
        'plos': [
            r'journals\.plos\.org',
            r'plos\.org',
            r'.*\.plos\.org'
        ],
        'nature': [
            r'nature\.com',
            r'.*\.nature\.com'
        ],
        'sage': [
            r'journals\.sagepub\.com',
            r'sagepub\.com',
            r'.*\.sagepub\.com'
        ],
        'taylor_francis': [
            r'tandfonline\.com',
            r'.*\.tandfonline\.com'
        ],
        'acs': [
            r'pubs\.acs\.org',
            r'acs\.org',
            r'.*\.acs\.org'
        ],
        'arxiv': [
            r'arxiv\.org',
            r'export\.arxiv\.org'
        ],
        'pubmed': [
            r'pubmed\.ncbi\.nlm\.nih\.gov',
            r'ncbi\.nlm\.nih\.gov'
        ],
        'oxford': [
            r'academic\.oup\.com',
            r'oup\.com',
            r'.*\.oup\.com'
        ],
        'cambridge': [
            r'cambridge\.org',
            r'.*\.cambridge\.org'
        ],
        'biorxiv': [
            r'biorxiv\.org',
            r'.*\.biorxiv\.org'
        ],
        'researchgate': [
            r'researchgate\.net',
            r'.*\.researchgate\.net'
        ]
    }
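
    # Illustrative example (hypothetical URL, not from real data): a DOI that
    # resolves to "https://www.sciencedirect.com/science/article/pii/S0000000000000000"
    # has the domain "sciencedirect.com" once the "www." prefix is stripped,
    # which matches the first 'elsevier' pattern above, so the paper would be
    # attributed to 'elsevier'.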

    def scrape(self, doi: str) -> ScrapeResult:
        """Detect publisher from the final URL after DOI redirect."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        try:
            # Get the final URL by following the DOI redirect
            final_url = self._get_final_url(doi)

            if not final_url:
                error_msg = f"Could not resolve DOI {doi} to a URL"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "doi_resolution_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Detect publisher from URL
            detected_publisher = self._detect_publisher_from_url(final_url)

            if detected_publisher:
                # Update paper with detected publisher
                paper.publisher = detected_publisher
                paper.status = self.OUTPUT_STATUS_SUCCESS
                paper.error_msg = None
                db.session.commit()

                success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
                self.log_scrape_success(doi, success_msg, paper.id)
                return ScrapeResult(
                    status="success",
                    message=success_msg,
                    data={
                        "publisher": detected_publisher,
                        "final_url": final_url
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
            else:
                error_msg = f"Could not detect publisher from URL: {final_url}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={
                        "final_url": final_url,
                        "error_code": "publisher_not_detected"
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

        except Exception as e:
            error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "publisher_detection_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

    def _get_final_url(self, doi: str) -> Optional[str]:
        """
        Get the final URL after following DOI redirects.

        Args:
            doi: The DOI to resolve

        Returns:
            Final URL after redirects, or None if resolution fails
        """
        try:
            doi_url = f"https://doi.org/{doi}"
            headers = {
                'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }

            # Make a HEAD request to get the final URL without downloading content
            response = requests.head(
                doi_url,
                headers=headers,
                timeout=15,
                allow_redirects=True
            )

            # Some servers reject HEAD; fall back to a streamed GET so the
            # response body is never actually downloaded.
            if response.status_code == 405:  # Method Not Allowed
                response = requests.get(
                    doi_url,
                    headers=headers,
                    timeout=15,
                    allow_redirects=True,
                    stream=True
                )
                response.close()  # Close the connection after reading the headers

            if response.status_code in (200, 301, 302):
                return response.url
            return None

        except Exception:
            # Don't raise; the caller treats None as a DOI resolution failure
            # and marks the paper accordingly.
            return None
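
    # For example (an assumption about typical behavior, not verified here): a
    # DOI under the 10.1038 prefix usually redirects through doi.org to a
    # nature.com article page, which the patterns above attribute to 'nature'.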

    def _detect_publisher_from_url(self, url: str) -> Optional[str]:
        """
        Detect publisher from URL using domain patterns.

        Args:
            url: The URL to analyze

        Returns:
            Publisher name if detected, None otherwise
        """
        if not url:
            return None

        # Parse the URL to get the domain
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()

        # Remove 'www.' prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        # Score each publisher based on URL pattern matches
        publisher_scores = {}
        for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
            score = 0
            for pattern in patterns:
                # Anchor the pattern at a label boundary so that, e.g.,
                # "signature.com" does not match r'nature\.com' as a bare
                # substring.
                if re.search(rf'(^|\.){pattern}$', domain, re.IGNORECASE):
                    score += 10  # Strong match for domain patterns

                # Also check the full URL for path-based patterns
                if re.search(pattern, url.lower(), re.IGNORECASE):
                    score += 5

            if score > 0:
                publisher_scores[publisher] = score

        # Return the publisher with the highest score, but only with reasonable
        # confidence (score > 5), i.e. at least one domain-level match; a
        # path-only match is too weak on its own.
        if publisher_scores:
            best_publisher = max(publisher_scores, key=publisher_scores.get)
            if publisher_scores[best_publisher] > 5:
                return best_publisher

        return None
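

# Quick offline sketch (hypothetical URLs, no network or database access),
# exercising only the pure URL-matching helper. Run as a module, e.g.
# `python -m scipaperloader.scrapers.publisher_detector`, so the relative
# imports resolve.
if __name__ == "__main__":
    # Assumption: BaseScraper's constructor is not needed for URL matching,
    # so we bypass it with __new__; PUBLISHER_URL_PATTERNS is a class attribute.
    scraper = Scraper.__new__(Scraper)
    for sample in (
        "https://www.sciencedirect.com/science/article/pii/S0000000000000000",
        "https://link.springer.com/article/10.1007/s00000-000-0000-0",
        "https://example.com/unknown-publisher",
    ):
        print(sample, "->", scraper._detect_publisher_from_url(sample))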