# SciPaperLoader/scipaperloader/scrapers/publisher_detector.py
import time
import requests
import re
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
# Publisher detection patterns based on URL domains and paths
PUBLISHER_URL_PATTERNS = {
        'elsevier': [
            r'sciencedirect\.com',
            r'elsevier\.com',
            r'.*\.elsevier\.com'
        ],
        'springer': [
            r'link\.springer\.com',
            r'springer\.com',
            r'.*\.springer\.com'
        ],
        'wiley': [
            r'onlinelibrary\.wiley\.com',
            r'wiley\.com',
            r'.*\.wiley\.com'
        ],
        'ieee': [
            r'ieeexplore\.ieee\.org',
            r'ieee\.org',
            r'.*\.ieee\.org'
        ],
        'plos': [
            r'journals\.plos\.org',
            r'plos\.org',
            r'.*\.plos\.org'
        ],
        'nature': [
            r'nature\.com',
            r'.*\.nature\.com'
        ],
        'sage': [
            r'journals\.sagepub\.com',
            r'sagepub\.com',
            r'.*\.sagepub\.com'
        ],
        'taylor_francis': [
            r'tandfonline\.com',
            r'.*\.tandfonline\.com'
        ],
        'acs': [
            r'pubs\.acs\.org',
            r'acs\.org',
            r'.*\.acs\.org'
        ],
        'arxiv': [
            r'arxiv\.org',
            r'export\.arxiv\.org'
        ],
        'pubmed': [
            r'pubmed\.ncbi\.nlm\.nih\.gov',
            r'ncbi\.nlm\.nih\.gov'
        ],
        'oxford': [
            r'academic\.oup\.com',
            r'oup\.com',
            r'.*\.oup\.com'
        ],
        'cambridge': [
            r'cambridge\.org',
            r'.*\.cambridge\.org'
        ],
        'biorxiv': [
            r'biorxiv\.org',
            r'.*\.biorxiv\.org'
        ],
        'researchgate': [
            r'researchgate\.net',
            r'.*\.researchgate\.net'
        ]
    }
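
    # Illustrative example (hypothetical URL, not from real data): a DOI that
    # resolves to "https://www.sciencedirect.com/science/article/pii/S0000000000000000"
    # has the domain "sciencedirect.com" once the "www." prefix is stripped,
    # which matches the first 'elsevier' pattern above, so the paper would be
    # attributed to 'elsevier'.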

    def scrape(self, doi: str) -> ScrapeResult:
        """Detect publisher from the final URL after DOI redirect."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        try:
            # Get the final URL by following the DOI redirect
            final_url = self._get_final_url(doi)

            if not final_url:
                error_msg = f"Could not resolve DOI {doi} to a URL"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "doi_resolution_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Detect publisher from URL
            detected_publisher = self._detect_publisher_from_url(final_url)

            if detected_publisher:
                # Update paper with detected publisher
                paper.publisher = detected_publisher
                paper.status = self.OUTPUT_STATUS_SUCCESS
                paper.error_msg = None
                db.session.commit()

                success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
                self.log_scrape_success(doi, success_msg, paper.id)
                return ScrapeResult(
                    status="success",
                    message=success_msg,
                    data={
                        "publisher": detected_publisher,
                        "final_url": final_url
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
            else:
                error_msg = f"Could not detect publisher from URL: {final_url}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={
                        "final_url": final_url,
                        "error_code": "publisher_not_detected"
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

        except Exception as e:
            error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "publisher_detection_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

    def _get_final_url(self, doi: str) -> Optional[str]:
        """
        Get the final URL after following DOI redirects.

        Args:
            doi: The DOI to resolve

        Returns:
            Final URL after redirects, or None if resolution fails
        """
        try:
            doi_url = f"https://doi.org/{doi}"
            headers = {
                'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }

            # Make a HEAD request to get the final URL without downloading content
            response = requests.head(
                doi_url,
                headers=headers,
                timeout=15,
                allow_redirects=True
            )

            # Some servers reject HEAD; fall back to a streamed GET so the
            # response body is never actually downloaded.
            if response.status_code == 405:  # Method Not Allowed
                response = requests.get(
                    doi_url,
                    headers=headers,
                    timeout=15,
                    allow_redirects=True,
                    stream=True
                )
                response.close()  # Close the connection after reading the headers

            if response.status_code in (200, 301, 302):
                return response.url
            return None

        except Exception:
            # Don't raise; the caller treats None as a DOI resolution failure
            # and marks the paper accordingly.
            return None
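
    # For example (an assumption about typical behavior, not verified here): a
    # DOI under the 10.1038 prefix usually redirects through doi.org to a
    # nature.com article page, which the patterns above attribute to 'nature'.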

    def _detect_publisher_from_url(self, url: str) -> Optional[str]:
        """
        Detect publisher from URL using domain patterns.

        Args:
            url: The URL to analyze

        Returns:
            Publisher name if detected, None otherwise
        """
        if not url:
            return None

        # Parse the URL to get the domain
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()

        # Remove 'www.' prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        # Score each publisher based on URL pattern matches
        publisher_scores = {}
        for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
            score = 0
            for pattern in patterns:
                # Anchor the pattern at a label boundary so that, e.g.,
                # "signature.com" does not match r'nature\.com' as a bare
                # substring.
                if re.search(rf'(^|\.){pattern}$', domain, re.IGNORECASE):
                    score += 10  # Strong match for domain patterns

                # Also check the full URL for path-based patterns
                if re.search(pattern, url.lower(), re.IGNORECASE):
                    score += 5

            if score > 0:
                publisher_scores[publisher] = score

        # Return the publisher with the highest score, but only with reasonable
        # confidence (score > 5), i.e. at least one domain-level match; a
        # path-only match is too weak on its own.
        if publisher_scores:
            best_publisher = max(publisher_scores, key=publisher_scores.get)
            if publisher_scores[best_publisher] > 5:
                return best_publisher

        return None
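

# Quick offline sketch (hypothetical URLs, no network or database access),
# exercising only the pure URL-matching helper. Run as a module, e.g.
# `python -m scipaperloader.scrapers.publisher_detector`, so the relative
# imports resolve.
if __name__ == "__main__":
    # Assumption: BaseScraper's constructor is not needed for URL matching,
    # so we bypass it with __new__; PUBLISHER_URL_PATTERNS is a class attribute.
    scraper = Scraper.__new__(Scraper)
    for sample in (
        "https://www.sciencedirect.com/science/article/pii/S0000000000000000",
        "https://link.springer.com/article/10.1007/s00000-000-0000-0",
        "https://example.com/unknown-publisher",
    ):
        print(sample, "->", scraper._detect_publisher_from_url(sample))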