import re
import time
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse

import requests
from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig


class Scraper(BaseScraper):
    """Publisher detection scraper that identifies the publisher from the final URL after a DOI redirect."""

    # This scraper processes "New" papers and outputs "PublisherDetected"/"Failed".
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "PublisherDetected"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "DetectingPublisher"

    # Publisher detection patterns based on URL domains and paths.
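    # Each pattern is a regular expression applied with re.search to both the
    # hostname and the full URL (see _detect_publisher_from_url), so a bare
    # domain like r'nature\.com' already matches its subdomains; the explicit
    # r'.*\.<domain>' variants mainly add weight to the match score.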
    PUBLISHER_URL_PATTERNS = {
        'elsevier': [
            r'sciencedirect\.com',
            r'elsevier\.com',
            r'.*\.elsevier\.com'
        ],
        'springer': [
            r'link\.springer\.com',
            r'springer\.com',
            r'.*\.springer\.com'
        ],
        'wiley': [
            r'onlinelibrary\.wiley\.com',
            r'wiley\.com',
            r'.*\.wiley\.com'
        ],
        'ieee': [
            r'ieeexplore\.ieee\.org',
            r'ieee\.org',
            r'.*\.ieee\.org'
        ],
        'plos': [
            r'journals\.plos\.org',
            r'plos\.org',
            r'.*\.plos\.org'
        ],
        'nature': [
            r'nature\.com',
            r'.*\.nature\.com'
        ],
        'sage': [
            r'journals\.sagepub\.com',
            r'sagepub\.com',
            r'.*\.sagepub\.com'
        ],
        'taylor_francis': [
            r'tandfonline\.com',
            r'.*\.tandfonline\.com'
        ],
        'acs': [
            r'pubs\.acs\.org',
            r'acs\.org',
            r'.*\.acs\.org'
        ],
        'arxiv': [
            r'arxiv\.org',
            r'export\.arxiv\.org'
        ],
        'pubmed': [
            r'pubmed\.ncbi\.nlm\.nih\.gov',
            r'ncbi\.nlm\.nih\.gov'
        ],
        'oxford': [
            r'academic\.oup\.com',
            r'oup\.com',
            r'.*\.oup\.com'
        ],
        'cambridge': [
            r'cambridge\.org',
            r'.*\.cambridge\.org'
        ],
        'biorxiv': [
            r'biorxiv\.org',
            r'.*\.biorxiv\.org'
        ],
        'researchgate': [
            r'researchgate\.net',
            r'.*\.researchgate\.net'
        ]
    }
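    # To support an additional publisher, add an entry above, e.g. (hypothetical):
    #     'mdpi': [r'mdpi\.com', r'.*\.mdpi\.com'],
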
    def scrape(self, doi: str) -> ScrapeResult:
        """Detect the publisher from the final URL after the DOI redirect."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        try:
            # Get the final URL by following the DOI redirect
            final_url = self._get_final_url(doi)

            if not final_url:
                error_msg = f"Could not resolve DOI {doi} to a URL"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "doi_resolution_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Detect publisher from URL
            detected_publisher = self._detect_publisher_from_url(final_url)

            if detected_publisher:
                # Update paper with detected publisher
                paper.publisher = detected_publisher
                paper.status = self.OUTPUT_STATUS_SUCCESS
                paper.error_msg = None
                db.session.commit()

                success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
                self.log_scrape_success(doi, success_msg, paper.id)

                return ScrapeResult(
                    status="success",
                    message=success_msg,
                    data={
                        "publisher": detected_publisher,
                        "final_url": final_url
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
            else:
                error_msg = f"Could not detect publisher from URL: {final_url}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={
                        "final_url": final_url,
                        "error_code": "publisher_not_detected"
                    },
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

        except Exception as e:
            error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "publisher_detection_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

    def _get_final_url(self, doi: str) -> Optional[str]:
        """
        Get the final URL after following DOI redirects.

        Args:
            doi: The DOI to resolve

        Returns:
            Final URL after redirects, or None if resolution fails
        """
        try:
            doi_url = f"https://doi.org/{doi}"
            headers = {
                'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }

            # Make a HEAD request to get the final URL without downloading content
            response = requests.head(
                doi_url,
                headers=headers,
                timeout=15,
                allow_redirects=True
            )
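
            # With allow_redirects=True, requests follows the whole redirect
            # chain and response.url holds the final resolved URL.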

            # If HEAD is not allowed, fall back to GET without downloading the body
            if response.status_code == 405:  # Method Not Allowed
                response = requests.get(
                    doi_url,
                    headers=headers,
                    timeout=15,
                    allow_redirects=True,
                    stream=True  # Don't download the full content
                )
                response.close()  # Close the connection after reading the headers

            if response.status_code in [200, 302, 301]:
                return response.url
            else:
                return None

        except Exception:
            # Swallow resolution errors; the caller treats None as a failed lookup
            return None

    def _detect_publisher_from_url(self, url: str) -> Optional[str]:
        """
        Detect publisher from URL using domain patterns.

        Args:
            url: The URL to analyze

        Returns:
            Publisher name if detected, None otherwise
        """
        if not url:
            return None

        # Parse the URL to get the domain
        parsed_url = urlparse(url)
        domain = parsed_url.netloc.lower()

        # Remove the 'www.' prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]

        # Score each publisher based on URL pattern matches
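        # Worked example: for "https://www.nature.com/articles/xyz" the stripped
        # domain is "nature.com"; r'nature\.com' matches the domain (+10) and the
        # full URL (+5), and r'.*\.nature\.com' matches only the full URL (+5),
        # giving 'nature' a score of 20, well above the confidence threshold.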
        publisher_scores = {}

        for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
            score = 0
            for pattern in patterns:
                if re.search(pattern, domain, re.IGNORECASE):
                    score += 10  # Strong match for domain patterns

                # Also check the full URL for path-based patterns
                if re.search(pattern, url.lower(), re.IGNORECASE):
                    score += 5

            if score > 0:
                publisher_scores[publisher] = score

        # Return the publisher with the highest score,
        # but only with reasonable confidence (score > 5)
        if publisher_scores:
            best_publisher = max(publisher_scores, key=publisher_scores.get)
            if publisher_scores[best_publisher] > 5:
                return best_publisher

        return None
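

# Illustrative usage (a sketch, not part of the module: it assumes Scraper()
# can be constructed without arguments and that a Flask application context
# and a seeded database are available; the DOI is a placeholder):
#
#     with app.app_context():
#         result = Scraper().scrape("10.1234/example-doi")
#         print(result.status, result.data)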