adds scraper modules and modular publisher parser system

Michael Beck 2025-06-13 10:11:59 +02:00
parent ce6bc03b46
commit a7964a2f3d
22 changed files with 2877 additions and 71 deletions

View File

@@ -1,5 +1,5 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics clean-papers purge-db
# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
@@ -14,7 +14,7 @@ clean:
rm -rf venv build dist .pytest_cache .mypy_cache *.egg-info
# Define database path
DB_PATH=scipaperloader/papers.db
DB_PATH=instance/papers.db
# Backup the database with timestamp
backup-db:
@@ -90,6 +90,24 @@ reset-db: venv
$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
$(PYTHON) -m flask --app scipaperloader db upgrade
# Clean all papers from the database (keep other tables intact)
clean-papers: venv
@echo "Cleaning all papers from the database..."
@$(PYTHON) -c "from scipaperloader.db import db; from scipaperloader.models import PaperMetadata; from scipaperloader import create_app; app = create_app(); app.app_context().push(); PaperMetadata.query.delete(); db.session.commit(); print('All papers have been removed from the database')"
# Completely purge all database contents (removes all tables and data)
purge-db: venv
@echo "WARNING: This will completely wipe all database contents!"
@read -p "Are you sure you want to continue? (y/N) " -n 1 -r; \
echo; \
if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
echo "Purging database..."; \
rm -f $(DB_PATH); \
echo "Database completely purged"; \
else \
echo "Operation cancelled"; \
fi
# Create and set up virtual environment
venv:
python3 -m venv venv && \

View File

@@ -15,6 +15,8 @@ dependencies = [
"pandas>=2.2.3,<3",
"APScheduler>=3.10.4,<4",
"flask-migrate>=4.1.0,<5",
"beautifulsoup4>=4.13.4,<5 ",
"requests>=2.32.4,<3"
]
[project.optional-dependencies]

View File

@@ -29,6 +29,10 @@ def index():
# Get volume configuration
volume_config = VolumeConfig.get_current_volume()
# Get scraper module configuration
from ..models import ScraperModuleConfig
current_scraper_module = ScraperModuleConfig.get_current_module()
# Get paper counts by status
paper_counts = {
'new': PaperMetadata.query.filter_by(status='New').count(),
@@ -46,7 +50,10 @@ def index():
recent_logs=recent_logs,
paper_counts=paper_counts,
volume_config=volume_config,
max_volume=MAX_VOLUME
max_volume=MAX_VOLUME,
current_scraper_module=current_scraper_module,
available_scraper_modules=[s["name"] for s in available_scrapers],
scraper_details={s["name"]: s for s in available_scrapers}
)
@bp.route("/start", methods=["POST"])
@@ -219,6 +226,13 @@ def get_status():
# Get current hour quota info
current_quota = scraper_manager.get_current_hour_quota()
# Get current scraper module configuration
from ..models import ScraperModuleConfig
current_scraper_module = ScraperModuleConfig.get_current_module()
# Get volume configuration
current_volume = VolumeConfig.get_current_volume()
return jsonify({
"success": True,
"scraper_state": {
@@ -227,7 +241,9 @@ def get_status():
"last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
},
"paper_counts": paper_counts,
"current_quota": current_quota
"current_quota": current_quota,
"current_scraper_module": current_scraper_module,
"volume_config": current_volume
})
except Exception as e:
@@ -665,6 +681,35 @@ def update_scraper_config():
"message": message
}), 400
# Handle scraper module configuration updates
if "scraper_module" in data:
from ..models import ScraperModuleConfig
new_module = data["scraper_module"]
# Validate that the module exists and is valid
available_modules = [m["name"] for m in get_available_scrapers()]
if new_module not in available_modules:
return jsonify({
"success": False,
"message": f"Invalid scraper module: {new_module}"
}), 400
# Update the database configuration
ScraperModuleConfig.set_module(new_module)
ActivityLog.log_scraper_command(
action="update_scraper_module",
status="success",
description=f"Updated scraper module to '{new_module}'"
)
return jsonify({
"success": True,
"message": f"Scraper module updated to '{new_module}' successfully"
})
# Handle other configuration updates here if needed in the future
return jsonify({
@@ -682,3 +727,72 @@ def update_scraper_config():
"success": False,
"message": f"Error updating scraper config: {str(e)}"
}), 500
@bp.route("/publishers")
def get_publishers():
"""Get publisher overview data for the scraper overview modal."""
try:
import os
import glob
# Get available parser modules
parsers_dir = os.path.join(current_app.root_path, 'parsers')
parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py'))
available_parsers = []
for parser_file in parser_files:
filename = os.path.basename(parser_file)
if filename != 'base_parser.py': # Skip the base parser
parser_name = filename.replace('_parser.py', '')
available_parsers.append(parser_name)
# Get publishers from database (papers that have publisher detected)
publisher_query = db.session.query(
PaperMetadata.publisher,
db.func.count(PaperMetadata.id).label('paper_count')
).filter(
PaperMetadata.publisher.isnot(None),
PaperMetadata.publisher != ''
).group_by(PaperMetadata.publisher).all()
publishers_data = []
for publisher, count in publisher_query:
# Check if a parser exists for this publisher
has_parser = publisher in available_parsers
publishers_data.append({
'name': publisher,
'paper_count': count,
'has_parser': has_parser,
'parser_status': 'available' if has_parser else 'missing'
})
# Sort by paper count descending
publishers_data.sort(key=lambda x: x['paper_count'], reverse=True)
# Get totals
total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data)
total_papers_without_publisher = PaperMetadata.query.filter(
db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '')
).count()
return jsonify({
'success': True,
'data': {
'publishers': publishers_data,
'available_parsers': available_parsers,
'stats': {
'total_publishers': len(publishers_data),
'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]),
'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]),
'total_papers_with_publisher': total_papers_with_publisher,
'total_papers_without_publisher': total_papers_without_publisher
}
}
})
except Exception as e:
return jsonify({
'success': False,
'message': f'Error getting publisher data: {str(e)}'
}), 500

View File

@@ -191,6 +191,7 @@ class PaperMetadata(db.Model):
type = db.Column(db.String(50))
language = db.Column(db.String(50))
published_online = db.Column(db.Date) # or DateTime/String
publisher = db.Column(db.String(100), nullable=True) # Detected publisher name
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
previous_status = db.Column(db.String(10), nullable=True) # Store previous status for reversion
file_path = db.Column(db.Text)

View File

@@ -0,0 +1,6 @@
# Parser modules for extracting full text from publisher-specific HTML content
from .base_parser import BaseParser, ParsedContent, ParseError
from .elsevier_parser import ElsevierParser
from .arxiv_parser import ArxivParser
__all__ = ['BaseParser', 'ParsedContent', 'ParseError', 'ElsevierParser', 'ArxivParser']

View File

@@ -0,0 +1,227 @@
import re
from bs4 import BeautifulSoup
from typing import Dict, Optional, List
from .base_parser import BaseParser, ParsedContent, ParseError
class ArxivParser(BaseParser):
"""Parser for arXiv papers."""
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
"""Check if this is an arXiv page."""
html_lower = html_content.lower()
# Check for arXiv indicators
indicators = [
'arxiv.org',
'export.arxiv.org',
'arxiv:',
'meta name="citation_publisher" content="arxiv"',
]
return any(indicator in html_lower for indicator in indicators)
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
"""Parse arXiv HTML content."""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Extract title
title = self._extract_title(soup)
# Extract abstract
abstract = self._extract_abstract(soup)
# Extract authors
authors = self._extract_authors(soup)
# Extract full text (arXiv usually just has abstract on the HTML page)
full_text = self._extract_full_text(soup, abstract)
# Extract keywords/subjects
keywords = self._extract_subjects(soup)
# Extract arxiv ID
arxiv_id = self._extract_arxiv_id(soup)
if not full_text or len(full_text.strip()) < 50:
raise ParseError("Could not extract meaningful content from arXiv page")
return ParsedContent(
full_text=full_text,
title=title,
abstract=abstract,
authors=authors,
keywords=keywords,
sections=None, # arXiv HTML pages don't usually have full sections
references=None, # References are typically in the PDF
doi=doi,
journal="arXiv",
publication_date=self._extract_submission_date(soup),
metadata={
'parser': 'arxiv',
'arxiv_id': arxiv_id,
'source': 'arxiv.org'
}
)
except Exception as e:
raise ParseError(f"Failed to parse arXiv content: {str(e)}")
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract paper title."""
# Try multiple title selectors for arXiv
selectors = [
'h1.title',
'meta[name="citation_title"]',
'title'
]
for selector in selectors:
if 'meta' in selector:
element = soup.find('meta', attrs={'name': 'citation_title'})
if element:
return element.get('content', '').strip()
else:
element = soup.select_one(selector)
if element:
text = element.get_text(strip=True)
# Remove "Title:" prefix if present
text = re.sub(r'^Title:\s*', '', text)
return text
return None
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract paper abstract."""
# arXiv abstract selectors
selectors = [
'blockquote.abstract',
'div.abstract',
'meta[name="citation_abstract"]'
]
for selector in selectors:
if 'meta' in selector:
element = soup.find('meta', attrs={'name': 'citation_abstract'})
if element:
return element.get('content', '').strip()
else:
element = soup.select_one(selector)
if element:
text = element.get_text(strip=True)
# Remove "Abstract:" prefix if present
text = re.sub(r'^Abstract:\s*', '', text)
return text
return None
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract author names."""
authors = []
# Try author meta tags
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
if author_metas:
authors = [meta.get('content', '').strip() for meta in author_metas]
# Try arXiv author div
if not authors:
authors_div = soup.select_one('div.authors')
if authors_div:
# Extract author links or text
author_links = authors_div.find_all('a')
if author_links:
authors = [link.get_text(strip=True) for link in author_links]
else:
# Fallback to text parsing
text = authors_div.get_text()
# Remove "Authors:" prefix and split by commas
text = re.sub(r'^Authors?:\s*', '', text)
authors = [author.strip() for author in text.split(',')]
return authors if authors else None
def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
"""Extract main content (usually just abstract for arXiv HTML pages)."""
content_parts = []
# For arXiv, the HTML page typically only contains abstract and metadata
# The full text is in the PDF
if abstract:
content_parts.append(f"Abstract\n{abstract}")
# Look for any additional content sections
comments_section = soup.select_one('td.comments')
if comments_section:
comments = comments_section.get_text(strip=True)
if comments:
content_parts.append(f"Comments\n{comments}")
# Add note about PDF availability
content_parts.append(
"\nNote: This is the abstract and metadata from the arXiv HTML page. "
"The full text is available in the PDF version."
)
return '\n\n'.join(content_parts)
def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract subject classifications."""
subjects = []
# Look for subject classification
subjects_td = soup.select_one('td.subjects')
if subjects_td:
subjects_text = subjects_td.get_text(strip=True)
# Parse subjects (format: "Primary: subject1; Secondary: subject2")
subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
# Clean up prefixes
subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
subjects = [subj for subj in subjects if subj] # Remove empty strings
return subjects if subjects else None
def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract arXiv ID."""
# Look for arXiv ID in various places
arxiv_id_patterns = [
r'arXiv:(\d+\.\d+(?:v\d+)?)',
r'(\d{4}\.\d{4,5}(?:v\d+)?)',
]
# Search in page text
page_text = soup.get_text()
for pattern in arxiv_id_patterns:
match = re.search(pattern, page_text)
if match:
return match.group(1)
# Search in URL or meta tags
canonical_link = soup.find('link', attrs={'rel': 'canonical'})
if canonical_link:
href = canonical_link.get('href', '')
for pattern in arxiv_id_patterns:
match = re.search(pattern, href)
if match:
return match.group(1)
return None
def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract submission date."""
# Look for submission date
submission_td = soup.select_one('td.submission-history')
if submission_td:
date_text = submission_td.get_text()
# Extract date (format varies)
date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
if date_match:
return date_match.group(1)
# Try meta tag
date_meta = soup.find('meta', attrs={'name': 'citation_date'})
if date_meta:
return date_meta.get('content', '').strip()
return None
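A short usage sketch (not part of the commit) showing how this parser and the ParsedContent container fit together; the package path, file name, and DOI below are assumptions for illustration:

```python
from pathlib import Path

from scipaperloader.parsers.arxiv_parser import ArxivParser  # package path assumed

# Hypothetical HTML file previously saved by one of the fetcher scrapers
html = Path('downloads/10.48550_arXiv.2301.00001.html').read_text(encoding='utf-8')

parser = ArxivParser()
if parser.can_parse(html):
    content = parser.parse(html, doi='10.48550/arXiv.2301.00001')  # illustrative DOI
    print(content.title)
    print(content.metadata['arxiv_id'] if content.metadata else None)
    print(content.full_text[:200])
```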

View File

@@ -0,0 +1,83 @@
from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass
@dataclass
class ParsedContent:
"""Container for parsed content from a publisher's HTML."""
full_text: str
title: Optional[str] = None
abstract: Optional[str] = None
authors: Optional[List[str]] = None
keywords: Optional[List[str]] = None
sections: Optional[Dict[str, str]] = None # section_title -> section_content
references: Optional[List[str]] = None
doi: Optional[str] = None
journal: Optional[str] = None
publication_date: Optional[str] = None
metadata: Optional[Dict] = None # Additional metadata specific to publisher
class BaseParser(ABC):
"""Base class for all publisher-specific parsers."""
def __init__(self):
self.parser_name = self.__class__.__name__.lower().replace('parser', '')
@abstractmethod
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
"""
Check if this parser can handle the given HTML content.
Args:
html_content: The HTML content to check
url: Optional URL of the content (for additional context)
Returns:
True if this parser can handle the content, False otherwise
"""
pass
@abstractmethod
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
"""
Parse HTML content and extract structured information.
Args:
html_content: The HTML content to parse
doi: Optional DOI of the paper
Returns:
ParsedContent object with extracted information
Raises:
ParseError: If parsing fails
"""
pass
def get_name(self) -> str:
"""Return the name of this parser."""
return self.parser_name
def get_description(self) -> str:
"""Return a description of this parser."""
return self.__class__.__doc__ or "No description available"
def validate_content(self, content: ParsedContent) -> bool:
"""
Validate the parsed content to ensure it meets minimum requirements.
Args:
content: The parsed content to validate
Returns:
True if content is valid, False otherwise
"""
# Basic validation - must have some full text
if not content.full_text or len(content.full_text.strip()) < 100:
return False
return True
class ParseError(Exception):
"""Exception raised when parsing fails."""
pass
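To make the contract concrete, here is a minimal sketch of a hypothetical subclass (a SpringerParser is only mentioned later in this commit as a TODO; the indicators and selectors below are illustrative, not the real implementation):

```python
from typing import Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class SpringerParser(BaseParser):
    """Hypothetical parser for SpringerLink pages (illustrative only)."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        # Cheap substring checks, mirroring the other parsers in this commit
        html_lower = html_content.lower()
        return 'link.springer.com' in html_lower or 'springer.com' in html_lower

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('meta', attrs={'name': 'citation_title'})
        title = title_tag.get('content', '').strip() if title_tag else None
        # Fall back to the whole visible text; a real parser would target article sections
        full_text = soup.get_text(separator='\n', strip=True)
        if len(full_text) < 100:
            raise ParseError("Could not extract meaningful content from Springer page")
        return ParsedContent(
            full_text=full_text,
            title=title,
            doi=doi,
            metadata={'parser': 'springer'},
        )
```

A class like this would still need to be added to the parser registry kept by the text-extraction scraper further down in this commit.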

View File

@@ -0,0 +1,252 @@
import re
from bs4 import BeautifulSoup
from typing import Dict, Optional, List
from .base_parser import BaseParser, ParsedContent, ParseError
class ElsevierParser(BaseParser):
"""Parser for Elsevier/ScienceDirect articles."""
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
"""Check if this is an Elsevier/ScienceDirect page."""
html_lower = html_content.lower()
# Check for Elsevier/ScienceDirect indicators
indicators = [
'sciencedirect.com',
'elsevier.com',
'meta name="citation_publisher" content="elsevier"',
'sciencedirect',
]
return any(indicator in html_lower for indicator in indicators)
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
"""Parse Elsevier/ScienceDirect HTML content."""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Extract title
title = self._extract_title(soup)
# Extract abstract
abstract = self._extract_abstract(soup)
# Extract authors
authors = self._extract_authors(soup)
# Extract full text
full_text = self._extract_full_text(soup)
# Extract sections
sections = self._extract_sections(soup)
# Extract keywords
keywords = self._extract_keywords(soup)
# Extract references
references = self._extract_references(soup)
# Extract journal info
journal = self._extract_journal(soup)
# Extract publication date
publication_date = self._extract_publication_date(soup)
# Combine everything into full text if sections exist
if sections:
full_text = self._combine_sections(sections, abstract)
if not full_text or len(full_text.strip()) < 100:
raise ParseError("Could not extract meaningful full text content")
return ParsedContent(
full_text=full_text,
title=title,
abstract=abstract,
authors=authors,
keywords=keywords,
sections=sections,
references=references,
doi=doi,
journal=journal,
publication_date=publication_date,
metadata={
'parser': 'elsevier',
'source': 'sciencedirect'
}
)
except Exception as e:
raise ParseError(f"Failed to parse Elsevier content: {str(e)}")
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract article title."""
# Try multiple title selectors
selectors = [
'h1.title-text',
'h1[data-testid="title"]',
'h1.article-title',
'meta[name="citation_title"]',
'title'
]
for selector in selectors:
if 'meta' in selector:
element = soup.find('meta', attrs={'name': 'citation_title'})
if element:
return element.get('content', '').strip()
else:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return None
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract article abstract."""
selectors = [
'div.abstract-content',
'div[data-testid="abstract"]',
'div.abstract',
'section.abstract',
'div#abstract'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return None
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract author names."""
authors = []
# Try author meta tags
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
if author_metas:
authors = [meta.get('content', '').strip() for meta in author_metas]
# Try author div/span elements
if not authors:
author_elements = soup.select('div.author a, span.author, .author-name')
authors = [elem.get_text(strip=True) for elem in author_elements]
return authors if authors else None
def _extract_full_text(self, soup: BeautifulSoup) -> str:
"""Extract main article content."""
content_parts = []
# Try main content selectors
main_selectors = [
'div.article-content',
'div.body-content',
'main.article-body',
'div[data-testid="article-body"]',
'section.article-section'
]
for selector in main_selectors:
elements = soup.select(selector)
for element in elements:
# Remove script, style, and navigation elements
for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
unwanted.decompose()
text = element.get_text(separator='\n', strip=True)
if text and len(text) > 50: # Only add substantial content
content_parts.append(text)
return '\n\n'.join(content_parts)
def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
"""Extract article sections with headings."""
sections = {}
# Look for section headings and content
section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))
for heading in section_elements:
section_title = heading.get_text(strip=True)
# Find content after this heading until next heading
content_parts = []
current = heading.next_sibling
while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
if hasattr(current, 'get_text'):
text = current.get_text(strip=True)
if text:
content_parts.append(text)
current = current.next_sibling
if content_parts:
sections[section_title] = '\n'.join(content_parts)
return sections if sections else None
def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract article keywords."""
keywords = []
# Try keyword meta tags
keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
if keyword_metas:
for meta in keyword_metas:
content = meta.get('content', '')
if content:
keywords.extend([kw.strip() for kw in content.split(',')])
# Try keyword sections
if not keywords:
keyword_sections = soup.select('div.keywords, section.keywords')
for section in keyword_sections:
text = section.get_text()
keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])
return keywords if keywords else None
def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract references."""
references = []
ref_sections = soup.select('section.references, div.references, ol.references li')
for section in ref_sections:
if section.name == 'li':
references.append(section.get_text(strip=True))
else:
ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
references.extend([item.get_text(strip=True) for item in ref_items])
return references if references else None
def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract journal name."""
journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
if journal_meta:
return journal_meta.get('content', '').strip()
return None
def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract publication date."""
date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
if date_meta:
return date_meta.get('content', '').strip()
return None
def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
"""Combine all sections into full text."""
full_text_parts = []
if abstract:
full_text_parts.append(f"Abstract\n{abstract}")
for section_title, section_content in sections.items():
full_text_parts.append(f"{section_title}\n{section_content}")
return '\n\n'.join(full_text_parts)

View File

@@ -18,6 +18,43 @@ class BaseScraper(ABC):
OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping
OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing
def __init__(self):
"""Initialize the scraper."""
self.scraper_name = self.get_name().lower()
def log_scrape_start(self, doi: str, paper_id: Optional[int] = None):
"""Log the start of a scraping operation."""
from ..models import ActivityLog
ActivityLog.log_scraper_activity(
action=f"{self.scraper_name}_scrape_start",
status="info",
description=f"Starting {self.get_name()} for DOI: {doi}",
paper_id=paper_id
)
def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None):
"""Log successful completion of scraping."""
from ..models import ActivityLog
ActivityLog.log_scraper_activity(
action=f"{self.scraper_name}_scrape_success",
status="success",
description=f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
paper_id=paper_id
)
def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None):
"""Log failed scraping operation."""
from ..models import ActivityLog
ActivityLog.log_scraper_activity(
action=f"{self.scraper_name}_scrape_failure",
status="error",
description=f"{self.get_name()} failed for DOI: {doi} - {message}",
paper_id=paper_id
)
@abstractmethod
def scrape(self, doi: str) -> ScrapeResult:
"""

View File

@@ -30,6 +30,9 @@ class Scraper(BaseScraper):
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Simulate processing time (1-3 seconds)
processing_time = random.uniform(1, 3)
time.sleep(processing_time)
@@ -145,12 +148,7 @@ class Scraper(BaseScraper):
)
# Log success
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="success",
description=f"Successfully scraped {doi}",
paper_id=paper.id
)
self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)
result = ScrapeResult(
status="success",
@@ -178,12 +176,7 @@ class Scraper(BaseScraper):
paper.error_msg = error_msg
# Log failure
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="error",
description=f"Failed to scrape {doi}: {error_msg}",
paper_id=paper.id
)
self.log_scrape_failure(doi, error_msg, paper.id)
result = ScrapeResult(
status="error",

View File

@@ -30,13 +30,8 @@ class Scraper(BaseScraper):
timestamp=datetime.utcnow()
)
# Log retry attempt
ActivityLog.log_scraper_activity(
action="retry_failed_paper",
status="info",
description=f"Retrying failed paper: {paper.title}",
paper_id=paper.id
)
# Log start of retry
self.log_scrape_start(doi, paper.id)
# Simulate longer processing time for retry (2-5 seconds)
processing_time = random.uniform(2, 5)
@@ -64,12 +59,7 @@ class Scraper(BaseScraper):
result_data = {"file_path": file_path}
# Log success
ActivityLog.log_scraper_activity(
action="retry_scrape_success",
status="success",
description=f"Successfully retried {doi} on second attempt",
paper_id=paper.id
)
self.log_scrape_success(doi, f"Successfully retried {doi} on second attempt", paper.id)
result = ScrapeResult(
status="success",
@@ -81,12 +71,7 @@ class Scraper(BaseScraper):
except Exception as e:
error_msg = f"Failed to save retry file: {str(e)}"
ActivityLog.log_scraper_activity(
action="retry_scrape_file_error",
status="error",
description=error_msg,
paper_id=paper.id
)
self.log_scrape_failure(doi, error_msg, paper.id)
result = ScrapeResult(
status="error",
@@ -105,12 +90,7 @@ class Scraper(BaseScraper):
]
error_msg = random.choice(error_messages)
ActivityLog.log_scraper_activity(
action="retry_scrape_failure",
status="error",
description=f"Retry failed for {doi}: {error_msg}",
paper_id=paper.id
)
self.log_scrape_failure(doi, error_msg, paper.id)
result = ScrapeResult(
status="error",

View File

@@ -0,0 +1,172 @@
import time
import os
import requests
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Scraper that fetches HTML content from DOI and saves it for further processing."""
# This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "HtmlDownloaded"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "FetchingHtml"
def scrape(self, doi: str) -> ScrapeResult:
"""Fetch HTML content from DOI and save to download path."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
# Prepare file paths
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}.html"
file_path = os.path.join(download_path, file_name)
# Check/create download directory (same pattern as dummy)
if not os.path.exists(download_path):
try:
os.makedirs(download_path, exist_ok=True)
except OSError as e:
error_msg = f"Failed to create download directory: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Check path permissions (same pattern as dummy)
if not os.access(download_path, os.W_OK):
error_msg = f"Download path '{download_path}' is not writable"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
ActivityLog.log_scraper_activity(
action="html_fetch_path_error",
status="error",
description=error_msg,
paper_id=paper.id
)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_write_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
try:
# Fetch HTML from DOI
doi_url = f"https://doi.org/{doi}"
headers = {'User-Agent': 'SciPaperLoader/1.0'}
response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True)
# Check for invalid DOI (404) or other HTTP errors
if response.status_code == 404:
error_msg = f"Invalid DOI: {doi} not found"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "invalid_doi"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
response.raise_for_status() # Raise for other HTTP errors
# Save HTML content
with open(file_path, 'w', encoding='utf-8') as f:
f.write(response.text)
# Update paper status to success
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.file_path = file_path
paper.error_msg = None
db.session.commit()
# Log success
self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id)
return ScrapeResult(
status="success",
message=f"Successfully fetched HTML for {doi}",
data={
"file_path": file_path,
"url": response.url, # Final URL after redirects
"title": paper.title
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except requests.exceptions.RequestException as e:
error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
ActivityLog.log_scraper_activity(
action="html_fetch",
status="error",
description=error_msg,
paper_id=paper.id
)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "network_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Failed to save HTML file: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "file_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)

View File

@@ -0,0 +1,282 @@
import time
import requests
import re
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
# Publisher detection patterns based on URL domains and paths
PUBLISHER_URL_PATTERNS = {
'elsevier': [
r'sciencedirect\.com',
r'elsevier\.com',
r'.*\.elsevier\.com'
],
'springer': [
r'link\.springer\.com',
r'springer\.com',
r'.*\.springer\.com'
],
'wiley': [
r'onlinelibrary\.wiley\.com',
r'wiley\.com',
r'.*\.wiley\.com'
],
'ieee': [
r'ieeexplore\.ieee\.org',
r'ieee\.org',
r'.*\.ieee\.org'
],
'plos': [
r'journals\.plos\.org',
r'plos\.org',
r'.*\.plos\.org'
],
'nature': [
r'nature\.com',
r'.*\.nature\.com'
],
'sage': [
r'journals\.sagepub\.com',
r'sagepub\.com',
r'.*\.sagepub\.com'
],
'taylor_francis': [
r'tandfonline\.com',
r'.*\.tandfonline\.com'
],
'acs': [
r'pubs\.acs\.org',
r'acs\.org',
r'.*\.acs\.org'
],
'arxiv': [
r'arxiv\.org',
r'export\.arxiv\.org'
],
'pubmed': [
r'pubmed\.ncbi\.nlm\.nih\.gov',
r'ncbi\.nlm\.nih\.gov'
],
'oxford': [
r'academic\.oup\.com',
r'oup\.com',
r'.*\.oup\.com'
],
'cambridge': [
r'cambridge\.org',
r'.*\.cambridge\.org'
],
'biorxiv': [
r'biorxiv\.org',
r'.*\.biorxiv\.org'
],
'researchgate': [
r'researchgate\.net',
r'.*\.researchgate\.net'
]
}
def scrape(self, doi: str) -> ScrapeResult:
"""Detect publisher from the final URL after DOI redirect."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
try:
# Get the final URL by following the DOI redirect
final_url = self._get_final_url(doi)
if not final_url:
error_msg = f"Could not resolve DOI {doi} to a URL"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "doi_resolution_failed"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Detect publisher from URL
detected_publisher = self._detect_publisher_from_url(final_url)
if detected_publisher:
# Update paper with detected publisher
paper.publisher = detected_publisher
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.error_msg = None
db.session.commit()
success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
self.log_scrape_success(doi, success_msg, paper.id)
return ScrapeResult(
status="success",
message=success_msg,
data={
"publisher": detected_publisher,
"final_url": final_url
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
else:
error_msg = f"Could not detect publisher from URL: {final_url}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={
"final_url": final_url,
"error_code": "publisher_not_detected"
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "publisher_detection_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
def _get_final_url(self, doi: str) -> Optional[str]:
"""
Get the final URL after following DOI redirects.
Args:
doi: The DOI to resolve
Returns:
Final URL after redirects, or None if resolution fails
"""
try:
doi_url = f"https://doi.org/{doi}"
headers = {
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
# Make a HEAD request to get the final URL without downloading content
response = requests.head(
doi_url,
headers=headers,
timeout=15,
allow_redirects=True
)
# If HEAD is not allowed, try GET but with minimal content
if response.status_code == 405: # Method Not Allowed
response = requests.get(
doi_url,
headers=headers,
timeout=15,
allow_redirects=True,
stream=True # Don't download the full content
)
response.close() # Close connection after getting headers
if response.status_code in [200, 302, 301]:
return response.url
else:
return None
except Exception:
# DOI resolution failed - return None so the caller can handle it gracefully
return None
def _detect_publisher_from_url(self, url: str) -> Optional[str]:
"""
Detect publisher from URL using domain patterns.
Args:
url: The URL to analyze
Returns:
Publisher name if detected, None otherwise
"""
if not url:
return None
# Parse the URL to get the domain
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
# Remove 'www.' prefix if present
if domain.startswith('www.'):
domain = domain[4:]
# Score each publisher based on URL pattern matches
publisher_scores = {}
for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
score = 0
for pattern in patterns:
if re.search(pattern, domain, re.IGNORECASE):
score += 10 # Strong match for domain patterns
# Also check the full URL for path-based patterns
if re.search(pattern, url.lower(), re.IGNORECASE):
score += 5
if score > 0:
publisher_scores[publisher] = score
# Return the publisher with the highest score
if publisher_scores:
best_publisher = max(publisher_scores.keys(), key=lambda x: publisher_scores[x])
# Only return if we have a reasonable confidence (score > 5)
if publisher_scores[best_publisher] > 5:
return best_publisher
return None
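To illustrate the scoring, here is a small standalone re-creation of the logic above (two pattern groups copied from PUBLISHER_URL_PATTERNS; the article URL is made up):

```python
import re
from urllib.parse import urlparse

# Two of the pattern groups from the class above, reproduced for illustration
PATTERNS = {
    'elsevier': [r'sciencedirect\.com', r'elsevier\.com', r'.*\.elsevier\.com'],
    'springer': [r'link\.springer\.com', r'springer\.com', r'.*\.springer\.com'],
}

def detect(url: str):
    domain = urlparse(url).netloc.lower()
    if domain.startswith('www.'):
        domain = domain[4:]
    scores = {}
    for publisher, patterns in PATTERNS.items():
        score = sum(10 for p in patterns if re.search(p, domain, re.IGNORECASE))
        score += sum(5 for p in patterns if re.search(p, url.lower(), re.IGNORECASE))
        if score:
            scores[publisher] = score
    best = max(scores, key=lambda k: scores[k], default=None)
    return best if best is not None and scores[best] > 5 else None

# A DOI resolving to ScienceDirect ends up on a URL like this (made-up article id):
print(detect('https://www.sciencedirect.com/science/article/pii/S0000000000000000'))  # -> 'elsevier'
```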

View File

@@ -0,0 +1,237 @@
import time
import os
from datetime import datetime
from typing import Optional
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
from ..parsers.base_parser import BaseParser, ParseError
from ..parsers.elsevier_parser import ElsevierParser
from ..parsers.arxiv_parser import ArxivParser
class Scraper(BaseScraper):
"""Full text extraction scraper that uses publisher-specific parsers."""
# This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
OUTPUT_STATUS_SUCCESS = "TextExtracted"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "ExtractingText"
def __init__(self):
super().__init__()
# Registry of available parsers
self.parsers = [
ElsevierParser(),
ArxivParser(),
# Add more parsers here as you create them
# SpringerParser(),
# WileyParser(),
# IEEEParser(),
]
def scrape(self, doi: str) -> ScrapeResult:
"""Extract full text using appropriate publisher parser."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
# Check if HTML file exists
if not paper.file_path or not os.path.exists(paper.file_path):
error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "html_file_not_found"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
try:
# Read HTML content
with open(paper.file_path, 'r', encoding='utf-8') as f:
html_content = f.read()
# Find appropriate parser
parser = self._select_parser(html_content)
if not parser:
error_msg = f"No suitable parser found for DOI {doi}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "no_parser_available"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Parse content
parsed_content = parser.parse(html_content, doi)
# Validate parsed content
if not parser.validate_content(parsed_content):
error_msg = f"Parsed content validation failed for DOI {doi}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "content_validation_failed"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Save extracted text to file
text_file_path = self._save_extracted_text(parsed_content, doi)
# Update paper status to success
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.error_msg = None
# You might want to add a text_file_path field to store the text file location
# paper.text_file_path = text_file_path
db.session.commit()
success_msg = f"Successfully extracted text using {parser.get_name()} parser"
self.log_scrape_success(doi, success_msg, paper.id)
return ScrapeResult(
status="success",
message=f"Successfully extracted full text for {doi}",
data={
"text_file_path": text_file_path,
"parser_used": parser.get_name(),
"title": parsed_content.title,
"word_count": len(parsed_content.full_text.split()),
"has_abstract": bool(parsed_content.abstract),
"has_sections": bool(parsed_content.sections),
"author_count": len(parsed_content.authors) if parsed_content.authors else 0,
"keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
"reference_count": len(parsed_content.references) if parsed_content.references else 0
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except ParseError as e:
error_msg = f"Parser error for DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "parser_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "extraction_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
def _select_parser(self, html_content: str) -> Optional[BaseParser]:
"""
Select the most appropriate parser for the HTML content.
Args:
html_content: The HTML content to analyze
Returns:
The best parser for this content, or None if no parser can handle it
"""
for parser in self.parsers:
if parser.can_parse(html_content):
return parser
return None
def _save_extracted_text(self, parsed_content, doi: str) -> str:
"""
Save extracted text to a file.
Args:
parsed_content: The parsed content object
doi: The DOI of the paper
Returns:
Path to the saved text file
"""
download_path = DownloadPathConfig.get_path()
text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
text_file_path = os.path.join(download_path, text_file_name)
with open(text_file_path, 'w', encoding='utf-8') as f:
# Write structured content
f.write(f"DOI: {parsed_content.doi or doi}\n")
f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")
if parsed_content.authors:
f.write(f"Authors: {', '.join(parsed_content.authors)}\n")
if parsed_content.keywords:
f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")
f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
f.write("=" * 80 + "\n\n")
# Write full text
f.write(parsed_content.full_text)
# Optionally write references at the end
if parsed_content.references:
f.write("\n\n" + "=" * 80 + "\n")
f.write("REFERENCES\n")
f.write("=" * 80 + "\n")
for i, ref in enumerate(parsed_content.references, 1):
f.write(f"{i}. {ref}\n")
return text_file_path

View File

@@ -0,0 +1,201 @@
import time
import os
import requests
from urllib.parse import urlparse
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Web fetcher scraper that downloads HTML content from DOI URLs."""
# This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "FetchingWebContent"
def scrape(self, doi: str) -> ScrapeResult:
"""Fetch HTML content from DOI and save to download path."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
# Prepare file paths
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}.html"
file_path = os.path.join(download_path, file_name)
# Check/create download directory
if not os.path.exists(download_path):
try:
os.makedirs(download_path, exist_ok=True)
except OSError as e:
error_msg = f"Failed to create download directory: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Check path permissions
if not os.access(download_path, os.W_OK):
error_msg = f"Download path '{download_path}' is not writable"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_write_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
try:
# Fetch HTML from DOI
doi_url = f"https://doi.org/{doi}"
headers = {
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
response = requests.get(
doi_url,
headers=headers,
timeout=30,
allow_redirects=True,
verify=True
)
# Check for invalid DOI (404) or other HTTP errors
if response.status_code == 404:
error_msg = f"Invalid DOI: {doi} not found (404)"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "invalid_doi"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Check for other HTTP errors
response.raise_for_status()
# Save HTML content
with open(file_path, 'w', encoding='utf-8') as f:
f.write(response.text)
# Extract final URL after redirects (for publisher detection)
final_url = response.url
# Update paper status to success
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.file_path = file_path
paper.error_msg = None
db.session.commit()
# Log success
success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
self.log_scrape_success(doi, success_msg, paper.id)
return ScrapeResult(
status="success",
message=f"Successfully fetched HTML for {doi}",
data={
"file_path": file_path,
"final_url": final_url,
"content_length": len(response.text),
"content_type": response.headers.get('content-type', 'unknown'),
"title": paper.title,
"domain": urlparse(final_url).netloc if final_url else None
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except requests.exceptions.HTTPError as e:
error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "http_error", "status_code": e.response.status_code},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except requests.exceptions.RequestException as e:
error_msg = f"Network error fetching {doi_url}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "network_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Failed to save HTML file: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "file_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
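A scraper module like this can be exercised by hand from an application context; a minimal sketch, assuming the module lives at scipaperloader.scrapers.web_fetcher (the file path is not shown in this diff) and that the DOI already exists as a PaperMetadata row:

```python
# Hedged sketch (not part of the commit); module path and DOI are assumptions
from scipaperloader import create_app
from scipaperloader.scrapers.web_fetcher import Scraper  # hypothetical module path

app = create_app()
with app.app_context():
    # If no paper with this DOI exists, scrape() returns an error ScrapeResult instead of raising
    result = Scraper().scrape('10.1016/j.example.2024.101010')  # illustrative DOI
    print(result.status, result.message, result.duration)
```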

View File

@@ -0,0 +1,384 @@
# JavaScript Modularization Documentation
## Overview
The JavaScript code in the SciPaperLoader application has been modularized into reusable components to improve maintainability, reduce code duplication, and enable easier testing and updates.
## Modularization Task Completed
### Problem Statement
The original codebase had ~800+ lines of inline JavaScript scattered across multiple Jinja templates with several critical issues:
- **Code Duplication**: Similar functionality replicated across templates
- **Maintenance Difficulty**: Changes required editing multiple template files
- **Linter Issues**: Jinja template syntax mixed with JavaScript caused linting errors
- **Testing Challenges**: Inline code was difficult to unit test
- **Poor Separation of Concerns**: Template logic mixed with application logic
### Solution Implemented
Successfully transformed the codebase by:
1. **Extracted 10 Modular JavaScript Files** (~800+ lines of code moved from templates)
2. **Eliminated Code Duplication** by creating reusable components
3. **Fixed Linter Compatibility** by separating template syntax from JavaScript logic
4. **Implemented Clean Variable Passing** using JSON script tags instead of direct Jinja embedding
5. **Created Class-Based Architecture** with proper inheritance and composition patterns
6. **Established Inter-Component Communication** through callback systems
7. **Added Comprehensive Error Handling** and loading states throughout
### Key Achievements
- ✅ **5 templates modularized**: `scraper.html.jinja`, `papers.html.jinja`, `upload.html.jinja`, `logger.html.jinja`, `config/schedule.html.jinja`
- ✅ **10 JavaScript modules created**: Covering all functionality from utilities to dashboard coordination
- ✅ **Zero functionality loss**: All existing features preserved during modularization
- ✅ **Improved maintainability**: Changes now require editing single module files
- ✅ **Enhanced testability**: Individual modules can be unit tested
- ✅ **Clean variable handling**: Jinja variables passed as JSON configuration instead of inline embedding
### Before vs After Example
**Before (inline in template)**:
```html
<script>
var maxVolume = {{ max_volume }}; // Linter error
$('#start-scraper').click(function() {
// 50+ lines of mixed template/JS code
});
</script>
```
**After (modular)**:
```html
<script type="application/json" id="config-data">
{"maxVolume": {{ max_volume|tojson }}}
</script>
<script src="{{ url_for('static', filename='js/scraper-control.js') }}"></script>
<script>
const config = JSON.parse(document.getElementById('config-data').textContent);
new ScraperControl(config).init();
</script>
```
## Modular JavaScript Files
### 1. `/static/js/common.js`
**Purpose**: Common utilities used across the application
**Key Functions**:
- `showFlashMessage(message, type)` - Display flash messages to users
- `createStatusBadge(status)` - Generate status badge HTML
- `formatTimestamp(timestamp)` - Format timestamps for display
- `truncateText(text, maxLength)` - Truncate text with ellipsis
- `toggleButtonLoading(button, loading, loadingText)` - Handle button loading states
- `apiRequest(url, options)` - Generic API request wrapper
**Used by**: All templates that need basic utilities
### 2. `/static/js/modal-handler.js`
**Purpose**: Handle modal dialogs with dynamic content loading
**Key Features**:
- AJAX content loading
- Error handling
- Automatic click handler setup
- Bootstrap modal integration
**Used by**:
- `papers.html.jinja` (paper details modal)
- `logger.html.jinja` (log details modal)
### 3. `/static/js/form-handler.js`
**Purpose**: Handle form submissions with progress tracking
**Key Features**:
- Progress modal display
- Task status polling
- Error handling
- Customizable callbacks
**Used by**:
- `upload.html.jinja` (CSV upload form)
### 4. `/static/js/chart.js`
**Purpose**: Handle Chart.js activity visualization
**Key Features**:
- Chart initialization and rendering
- Data loading from API
- Error handling for missing Chart.js
**Used by**:
- `scraper.html.jinja` (activity charts)
### 5. `/static/js/scraper-control.js`
**Purpose**: Handle scraper control operations (start/stop/pause/reset)
**Key Features**:
- Status polling
- Volume configuration
- Callback system for refreshing other components
**Used by**:
- `scraper.html.jinja`
### 6. `/static/js/paper-processor.js`
**Purpose**: Handle paper search and processing functionality
**Key Features**:
- Paper search
- Single paper processing
- Status polling
- Scraper selection
**Used by**:
- `scraper.html.jinja`
### 7. `/static/js/activity-monitor.js`
**Purpose**: Handle activity log display and real-time notifications
**Key Features**:
- Activity log loading
- Real-time updates
- Notification management
**Used by**:
- `scraper.html.jinja`
### 8. `/static/js/scraper-dashboard.js`
**Purpose**: Coordinate all scraper dashboard components
**Key Features**:
- Component initialization
- Inter-component communication
- Configuration management
**Used by**:
- `scraper.html.jinja`
### 9. `/static/js/config-handler.js`
**Purpose**: Handle configuration forms and Alpine.js integration
**Key Features**:
- Configuration API calls
- Alpine.js data objects
- Schedule management
- Volume updates
**Used by**:
- `config/schedule.html.jinja`
## Template Updates
### Templates Using Modular JavaScript
1. **scraper.html.jinja**
- Uses all scraper-related modules
- Passes Jinja variables as configuration parameters
- Initializes dashboard with `initScraperDashboard(config)`
2. **papers.html.jinja**
- Uses `modal-handler.js` for paper detail modals
- Simplified from custom modal code to single line initialization
3. **upload.html.jinja**
- Uses `form-handler.js` for upload progress tracking
- Custom result display function
- Automatic task status polling
4. **logger.html.jinja**
- Uses `modal-handler.js` for log detail modals
- Custom URL construction for log endpoints
5. **config/schedule.html.jinja**
- Uses `config-handler.js` for Alpine.js integration
- Modular schedule management functions
## Benefits of Modularization
### 1. **Reusability**
- Modal functionality shared between papers and logger templates
- Common utilities used across all templates
- Form handling can be reused for other forms
### 2. **Maintainability**
- Single place to update common functionality
- Clear separation of concerns
- Easier debugging and testing
### 3. **Parameter Passing**
- Jinja variables passed as configuration objects
- No more hardcoded values in JavaScript
- Environment-specific settings easily configurable
### 4. **Extensibility**
- Easy to add new functionality to existing modules
- New templates can easily use existing modules
- Plugin-like architecture for components
## Usage Examples
### Basic Modal Usage
```javascript
const modal = new ModalHandler('modalId', 'contentElementId');
modal.setupClickHandlers('.clickable-items');
```
### Form with Progress Tracking
```javascript
const formHandler = new FormHandler('formId', {
onSuccess: (result) => console.log('Success:', result),
onError: (error) => console.log('Error:', error)
});
```
### Configuration Management
```javascript
// In Alpine.js template
x-data="configHandler.createScheduleManager(initialData, volume)"
```
## Migration Notes
### Old vs New Approach
**Before**: Inline JavaScript in each template
```html
<script>
document.addEventListener('DOMContentLoaded', function() {
// Lots of inline JavaScript code
});
</script>
```
**After**: Modular imports with configuration
```html
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
<script>
const modal = new ModalHandler('modalId', 'contentId');
modal.setupClickHandlers('.links');
</script>
```
### Jinja Variable Handling
To properly separate Jinja template variables from JavaScript code and avoid linting issues, we use a clean JSON configuration approach:
**Before**: Variables embedded directly in JavaScript (causes linting issues)
```javascript
if (volume > {{ max_volume }}) {
// Error handling - JSLint will complain about {{ }}
}
```
**After**: Clean separation using JSON script tags
```html
<!-- Jinja variables in JSON format -->
<script type="application/json" id="config-data">
{
"maxVolume": {{ max_volume|tojson }},
"currentVolume": {{ volume|tojson }},
"apiUrl": {{ url_for('api.endpoint')|tojson }},
"csrfToken": {{ csrf_token()|tojson }}
}
</script>
<!-- Clean JavaScript that reads the configuration -->
<script>
document.addEventListener('DOMContentLoaded', function() {
const config = JSON.parse(document.getElementById('config-data').textContent);
const handler = new VolumeHandler(config);
});
</script>
```
**Benefits of this approach**:
- **Linter-friendly**: No template syntax in JavaScript files
- **Type-safe**: JSON ensures proper data types
- **Maintainable**: Clear separation of concerns
- **Secure**: Automatic escaping with `|tojson` filter
- **Debuggable**: Easy to inspect configuration in DevTools
**Real-world example from scraper.html.jinja**:
```html
<script type="application/json" id="scraper-config">
{
"statusUrl": {{ url_for('api.scraper_status')|tojson }},
"startUrl": {{ url_for('api.start_scraper')|tojson }},
"volume": {{ volume|tojson }},
"scraperType": {{ scraper_type|tojson }},
"csrfToken": {{ csrf_token()|tojson }}
}
</script>
<script>
const config = JSON.parse(document.getElementById('scraper-config').textContent);
initScraperDashboard(config);
</script>
```
## Future Improvements
### Potential Enhancements
1. **Bundle Management**: Consider using webpack or similar for production builds
2. **Unit Testing**: Add comprehensive test suite for individual modules
3. **JSDoc Comments**: Add detailed documentation for better IDE support
4. **Centralized Error Reporting**: Implement global error handling system
5. **Performance Optimization**: Implement lazy loading for non-critical modules
6. **TypeScript Migration**: Consider migrating to TypeScript for better type safety
### Adding New Modules
When creating new JavaScript modules:
1. Follow the established class-based pattern
2. Include proper error handling
3. Use the configuration pattern for Jinja variables
4. Add documentation to this README
5. Update templates to use the new module
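A minimal skeleton for a new module, following the same class-based and configuration-driven conventions, might look like this; the class name, endpoint, and config keys are placeholders, and the helper signatures are assumed:
```javascript
// Skeleton for a hypothetical new module (e.g. /static/js/export-handler.js).
// The class name, endpoint, and config keys are placeholders; apiRequest,
// toggleButtonLoading, and showFlashMessage are the shared helpers from
// common.js (their signatures are assumed here).
class ExportHandler {
  constructor(config) {
    this.exportUrl = config.exportUrl; // passed in via a JSON config script tag
    this.button = document.getElementById(config.buttonId);
    this.initEventListeners();
  }

  initEventListeners() {
    if (!this.button) return; // fail gracefully if the template lacks the button
    this.button.addEventListener("click", () => this.startExport());
  }

  async startExport() {
    toggleButtonLoading(this.button, true, "Exporting...");
    try {
      const data = await apiRequest(this.exportUrl, { method: "POST" });
      showFlashMessage(
        data.message || "Export started",
        data.success ? "success" : "error"
      );
    } catch (error) {
      console.error("Export failed:", error);
      showFlashMessage("Network error while exporting. Please try again.", "error");
    } finally {
      toggleButtonLoading(this.button, false);
    }
  }
}
```
The matching template would load `common.js` first and pass `exportUrl` and `buttonId` through a `<script type="application/json">` block, keeping the configuration pattern consistent with the other modules.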
## Testing
A test file `test_js_modularization.py` has been created to verify the modularization. To run comprehensive testing:
```bash
python test_js_modularization.py
```
This will verify:
- All JavaScript files exist and are properly formatted
- Templates correctly reference the modular files
- Configuration patterns are properly implemented
- No inline JavaScript remains in templates
## Maintenance
### When Making Changes
1. **Update Single Module**: Changes to functionality only require editing one file
2. **Test Affected Templates**: Ensure all templates using the module still work
3. **Update Documentation**: Keep this README current with any changes
4. **Consider Dependencies**: Check if changes affect other modules
### File Organization
```
/static/js/
├── README.md # This documentation
├── common.js # Shared utilities
├── modal-handler.js # Modal functionality
├── form-handler.js # Form processing
├── chart.js # Chart visualization
├── scraper-control.js # Scraper operations
├── paper-processor.js # Paper management
├── activity-monitor.js # Activity tracking
├── scraper-dashboard.js # Dashboard coordination
├── config-handler.js # Configuration management
└── table-handler.js # Table utilities
```
## Migration Summary
The modularization moved **roughly 800 lines of inline JavaScript** out of the templates and into a maintainable, reusable module system. This improvement provides:
- **Enhanced maintainability** through single-responsibility modules
- **Reduced code duplication** via shared utility functions
- **Improved linter compatibility** by separating template and JavaScript concerns
- **Better testability** with isolated, unit-testable modules
- **Cleaner templates** with minimal, configuration-only JavaScript
- **Easier debugging** with clearly separated concerns and proper error handling
All existing functionality has been preserved while significantly improving the codebase architecture and developer experience.

View File

@ -38,12 +38,12 @@ class ScraperController {
this.resetButton.addEventListener("click", () => this.resetScraper());
}
// Volume form
const volumeForm = document.getElementById("volumeForm");
if (volumeForm) {
volumeForm.addEventListener("submit", (e) => {
// Configuration form (handles both volume and scraper module)
const configForm = document.getElementById("volumeForm");
if (configForm) {
configForm.addEventListener("submit", (e) => {
e.preventDefault();
this.updateVolume();
this.updateConfiguration();
});
}
}
@ -245,16 +245,22 @@ class ScraperController {
}
/**
* Update volume configuration
* Update configuration (volume and/or scraper module)
*/
async updateVolume() {
async updateConfiguration() {
const volumeInput = document.getElementById("volumeInput");
const scraperSelect = document.getElementById("mainScraperSelect");
const submitButton = document.querySelector(
'#volumeForm button[type="submit"]'
);
if (!volumeInput || !submitButton) return;
if (!submitButton) return;
const updates = {};
let hasChanges = false;
// Check volume changes
if (volumeInput) {
const volume = volumeInput.value;
// Basic validation
@ -267,27 +273,45 @@ class ScraperController {
return;
}
updates.volume = volume;
hasChanges = true;
}
// Check scraper module changes
if (scraperSelect && scraperSelect.value) {
updates.scraper_module = scraperSelect.value;
hasChanges = true;
}
if (!hasChanges) {
showFlashMessage("No changes to save", "info");
return;
}
// Toggle loading state
toggleButtonLoading(submitButton, true, "Updating...");
try {
const data = await apiRequest("/scraper/update_config", {
method: "POST",
body: JSON.stringify({ volume: volume }),
body: JSON.stringify(updates),
});
if (data.success) {
showFlashMessage(
data.message || "Volume updated successfully",
data.message || "Configuration updated successfully",
"success"
);
} else {
showFlashMessage(data.message || "Failed to update volume", "error");
showFlashMessage(
data.message || "Failed to update configuration",
"error"
);
}
} catch (error) {
console.error("Error updating volume:", error);
console.error("Error updating configuration:", error);
showFlashMessage(
"Network error while updating volume. Please try again.",
"Network error while updating configuration. Please try again.",
"error"
);
} finally {

View File

@ -0,0 +1,500 @@
/**
* Scraper Overview functionality
*/
class ScraperOverview {
constructor() {
this.modal = null;
this.scrapers = [];
this.systemConfig = {};
this.init();
}
init() {
// Initialize modal reference
this.modal = document.getElementById("scraperOverviewModal");
// Load data when modal is shown
if (this.modal) {
this.modal.addEventListener("show.bs.modal", () => {
this.loadScraperOverview();
});
}
}
async loadScraperOverview() {
const loadingEl = document.getElementById("scraperOverviewLoading");
const errorEl = document.getElementById("scraperOverviewError");
const contentEl = document.getElementById("scraperOverviewContent");
// Show loading state
loadingEl?.classList.remove("d-none");
errorEl?.classList.add("d-none");
contentEl?.classList.add("d-none");
try {
// Load scrapers, system config, and publishers in parallel
const [scrapersResponse, statusResponse, publishersResponse] =
await Promise.all([
fetch("/scraper/scrapers"),
fetch("/scraper/status"),
fetch("/scraper/publishers"),
]);
if (
!scrapersResponse.ok ||
!statusResponse.ok ||
!publishersResponse.ok
) {
throw new Error("Failed to load scraper information");
}
const scrapersData = await scrapersResponse.json();
const statusData = await statusResponse.json();
const publishersData = await publishersResponse.json();
if (
!scrapersData.success ||
!statusData.success ||
!publishersData.success
) {
throw new Error(
scrapersData.message ||
statusData.message ||
publishersData.message ||
"Unknown error"
);
}
this.scrapers = scrapersData.scrapers;
this.systemConfig = statusData;
this.publishersData = publishersData.data;
// Update UI
this.updateSystemConfig();
this.updateScrapersTable();
this.updatePublishersSection();
this.updateStatusFlowDiagram();
// Show content
loadingEl?.classList.add("d-none");
contentEl?.classList.remove("d-none");
} catch (error) {
console.error("Error loading scraper overview:", error);
// Show error state
loadingEl?.classList.add("d-none");
const errorMessage = document.getElementById(
"scraperOverviewErrorMessage"
);
if (errorMessage) {
errorMessage.textContent =
error.message || "Failed to load scraper information";
}
errorEl?.classList.remove("d-none");
}
}
updateSystemConfig() {
// Current scraper module
const currentModuleEl = document.getElementById("currentScraperModule");
if (currentModuleEl) {
const currentModule =
this.systemConfig.current_scraper_module || "System Default";
currentModuleEl.textContent = currentModule;
currentModuleEl.className = "badge bg-primary";
}
// Volume limit
const volumeLimitEl = document.getElementById("currentVolumeLimit");
if (volumeLimitEl) {
const volumeLimit = this.systemConfig.volume_config || "Unknown";
volumeLimitEl.textContent = volumeLimit;
}
// Total modules
const totalModulesEl = document.getElementById("totalScraperModules");
if (totalModulesEl) {
totalModulesEl.textContent = this.scrapers.length;
}
// Paper counts summary
const paperCountsEl = document.getElementById("paperCountsSummary");
if (paperCountsEl && this.systemConfig.paper_counts) {
const counts = this.systemConfig.paper_counts;
paperCountsEl.innerHTML = `
<div class="d-flex flex-wrap gap-2">
<span class="badge bg-primary">${counts.new || 0} New</span>
<span class="badge bg-warning">${
counts.processing || 0
} Processing</span>
<span class="badge bg-success">${
counts.done || 0
} Done</span>
<span class="badge bg-danger">${
counts.failed || 0
} Failed</span>
<span class="badge bg-info">${
counts.pending || 0
} Pending</span>
<span class="badge bg-secondary">${
counts.retrying || 0
} Retrying</span>
</div>
`;
}
}
updateScrapersTable() {
const tbody = document.getElementById("scrapersTableBody");
if (!tbody) return;
tbody.innerHTML = "";
this.scrapers.forEach((scraper) => {
const row = document.createElement("tr");
// Check if this is the current active scraper
const isCurrentScraper =
scraper.name === this.systemConfig.current_scraper_module;
if (scraper.error) {
row.innerHTML = `
<td>${scraper.name}</td>
<td colspan="5" class="text-danger">
<i class="fas fa-exclamation-triangle"></i> ${scraper.error}
</td>
`;
} else {
row.innerHTML = `
<td>
<strong>${scraper.name}</strong>
${
scraper.name === "dummy"
? '<span class="badge bg-info ms-2">Test Module</span>'
: ""
}
${
isCurrentScraper
? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
: ""
}
</td>
<td class="scraper-description">
${this.truncateDescription(scraper.description)}
</td>
<td class="input-status-list">
${this.renderStatusBadges(
scraper.input_statuses,
"bg-info"
)}
</td>
<td class="status-output">
<span class="badge bg-success">${
scraper.output_status_success
}</span>
</td>
<td class="status-output">
<span class="badge bg-danger">${
scraper.output_status_failure
}</span>
</td>
<td class="status-output">
<span class="badge bg-warning">${
scraper.output_status_processing
}</span>
</td>
`;
}
// Highlight the current scraper row
if (isCurrentScraper) {
row.classList.add("table-success");
}
tbody.appendChild(row);
});
}
updateStatusFlowDiagram() {
const diagramEl = document.getElementById("statusFlowDiagram");
if (!diagramEl) return;
// Analyze actual scrapers to build real flow
const statusFlow = this.analyzeScraperFlow();
let diagramHTML = '<div class="status-flow-container">';
// Create visual flow based on actual scrapers
statusFlow.forEach((stage, index) => {
if (index > 0) {
diagramHTML +=
'<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>';
}
diagramHTML += '<div class="status-flow-stage mb-4 p-3 border rounded">';
diagramHTML += `<div class="fw-bold mb-2 text-primary">${stage.title}</div>`;
if (stage.scrapers && stage.scrapers.length > 0) {
diagramHTML +=
'<div class="mb-2"><small class="text-muted">Handled by: ' +
stage.scrapers.map((s) => `<strong>${s}</strong>`).join(", ") +
"</small></div>";
}
diagramHTML += '<div class="status-badges">';
stage.statuses.forEach((status, statusIndex) => {
if (statusIndex > 0) {
diagramHTML += '<i class="fas fa-arrow-right status-flow-arrow"></i>';
}
const badgeClass = this.getStatusBadgeClass(status);
diagramHTML += `<span class="status-flow-node badge ${badgeClass}">${status}</span>`;
});
diagramHTML += "</div>";
if (stage.description) {
diagramHTML += `<div class="small text-muted mt-2">${stage.description}</div>`;
}
diagramHTML += "</div>";
});
diagramHTML += "</div>";
// Add explanation
diagramHTML += `
<div class="mt-4 p-3 bg-light rounded">
<h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
<ul class="small mb-0">
<li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
<li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
<li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
<li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
<li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
</ul>
</div>
`;
diagramEl.innerHTML = diagramHTML;
}
analyzeScraperFlow() {
// Build actual flow based on available scrapers
const stages = [];
const allInputStatuses = new Set();
const allOutputStatuses = new Set();
const scrapersByInput = {};
// Analyze scrapers to understand the flow
this.scrapers.forEach((scraper) => {
if (scraper.input_statuses) {
scraper.input_statuses.forEach((status) => {
allInputStatuses.add(status);
if (!scrapersByInput[status]) {
scrapersByInput[status] = [];
}
scrapersByInput[status].push(scraper.name);
});
}
if (scraper.output_status_success)
allOutputStatuses.add(scraper.output_status_success);
if (scraper.output_status_failure)
allOutputStatuses.add(scraper.output_status_failure);
});
// Entry point
if (allInputStatuses.has("New")) {
stages.push({
title: "Entry Point",
statuses: ["New"],
scrapers: scrapersByInput["New"] || [],
description: "Newly uploaded papers enter the processing pipeline",
});
}
// Processing stages
const processingStatuses = Array.from(allInputStatuses).filter(
(status) => !["New", "Done", "Failed"].includes(status)
);
if (processingStatuses.length > 0) {
stages.push({
title: "Processing Stages",
statuses: processingStatuses,
scrapers: [],
description: "Papers move through various processing stages",
});
}
// Final outputs
const finalStatuses = ["Done", "Failed"];
stages.push({
title: "Final States",
statuses: finalStatuses.filter((status) => allOutputStatuses.has(status)),
scrapers: [],
description: "Papers end up in final success or failure states",
});
// Retry handling
if (allInputStatuses.has("Failed")) {
stages.push({
title: "Retry Processing",
statuses: ["Failed", "Retrying"],
scrapers: scrapersByInput["Failed"] || [],
description: "Failed papers can be retried with specialized scrapers",
});
}
return stages;
}
getStatusBadgeClass(status) {
const statusClasses = {
New: "bg-primary",
Pending: "bg-warning",
Processing: "bg-warning",
Retrying: "bg-warning",
Done: "bg-success",
Failed: "bg-danger",
HtmlDownloaded: "bg-info",
PublisherDetected: "bg-info",
TextExtracted: "bg-info",
};
return statusClasses[status] || "bg-secondary";
}
renderStatusBadges(statuses, defaultClass = "bg-secondary") {
if (!Array.isArray(statuses)) return "";
return statuses
.map(
(status) =>
`<span class="badge ${this.getStatusBadgeClass(
status
)} status-badge">${status}</span>`
)
.join("");
}
truncateDescription(description, maxLength = 100) {
if (!description) return "No description available";
if (description.length <= maxLength) return description;
return description.substring(0, maxLength).trim() + "...";
}
updatePublishersSection() {
// Update publisher statistics
const publisherStatsEl = document.getElementById("publisherStats");
if (publisherStatsEl && this.publishersData && this.publishersData.stats) {
const stats = this.publishersData.stats;
publisherStatsEl.innerHTML = `
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-primary mb-1">${stats.total_publishers}</div>
<div class="text-muted small">Total Publishers</div>
</div>
</div>
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-success mb-1">${stats.publishers_with_parsers}</div>
<div class="text-muted small">With Parsers</div>
</div>
</div>
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-warning mb-1">${stats.publishers_without_parsers}</div>
<div class="text-muted small">Missing Parsers</div>
</div>
</div>
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-info mb-1">${stats.total_papers_with_publisher}</div>
<div class="text-muted small">Papers with Publisher</div>
</div>
</div>
`;
}
// Update publishers table
const publishersTableBody = document.getElementById("publishersTableBody");
if (
publishersTableBody &&
this.publishersData &&
this.publishersData.publishers
) {
publishersTableBody.innerHTML = "";
if (this.publishersData.publishers.length === 0) {
publishersTableBody.innerHTML = `
<tr>
<td colspan="4" class="text-center text-muted py-4">
<i class="fas fa-info-circle"></i> No publishers detected yet.<br>
<small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
</td>
</tr>
`;
return;
}
this.publishersData.publishers.forEach((publisher) => {
const row = document.createElement("tr");
// Publisher status badge
const statusBadge = publisher.has_parser
? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
: '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';
// Parser availability indicator
const parserIndicator = publisher.has_parser
? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
: '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';
row.innerHTML = `
<td>
<strong>${publisher.name}</strong>
</td>
<td>
<span class="badge bg-info">${publisher.paper_count}</span>
</td>
<td>${statusBadge}</td>
<td class="text-center">${parserIndicator}</td>
`;
publishersTableBody.appendChild(row);
});
}
}
// Public method to show the modal
show() {
if (this.modal) {
const bootstrapModal = new bootstrap.Modal(this.modal);
bootstrapModal.show();
}
}
}
// Global function to load scraper overview (used by retry button)
function loadScraperOverview() {
if (window.scraperOverview) {
window.scraperOverview.loadScraperOverview();
}
}
// Global function to show scraper overview modal
function showScraperOverview() {
if (!window.scraperOverview) {
window.scraperOverview = new ScraperOverview();
}
window.scraperOverview.show();
}
// Initialize when DOM is ready
document.addEventListener("DOMContentLoaded", function () {
window.scraperOverview = new ScraperOverview();
});

View File

@ -65,7 +65,13 @@
<div class="col-md-6">
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
<div class="form-section">
<div class="d-flex justify-content-between align-items-center mb-2">
<h6>Scraper Module</h6>
<button type="button" class="btn btn-outline-info btn-sm"
onclick="showScraperOverview()" title="View scraper modules overview">
<i class="fas fa-info-circle"></i> How Scrapers Work
</button>
</div>
<p class="text-muted">Select which scraper module to use for processing papers.</p>
<div class="mb-3">

View File

@ -53,4 +53,13 @@
{% endif %}
</div>
</div>
<!-- Include the scraper overview modal -->
{% include "partials/scraper_overview_modal.html.jinja" %}
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
{% endblock scripts %}

View File

@ -0,0 +1,249 @@
<!-- Scraper Overview Modal -->
<div class="modal fade" id="scraperOverviewModal" tabindex="-1" role="dialog"
aria-labelledby="scraperOverviewModalLabel" aria-hidden="true">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="scraperOverviewModalLabel">
<i class="fas fa-cogs"></i> Scraper Modules Overview
</h5>
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
<!-- Loading state -->
<div id="scraperOverviewLoading" class="text-center py-4">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
<p class="mt-2 text-muted">Loading scraper information...</p>
</div>
<!-- Error state -->
<div id="scraperOverviewError" class="alert alert-danger d-none" role="alert">
<h6 class="alert-heading">Error Loading Scrapers</h6>
<p id="scraperOverviewErrorMessage"></p>
<button class="btn btn-outline-danger btn-sm" onclick="loadScraperOverview()">
<i class="fas fa-redo"></i> Retry
</button>
</div>
<!-- Content -->
<div id="scraperOverviewContent" class="d-none">
<!-- Scraper Architecture Overview -->
<div class="card mb-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-info-circle"></i> How Scraper Modules Work
</h6>
</div>
<div class="card-body">
<p class="mb-3">
SciPaperLoader uses a modular scraper architecture where each scraper module handles
specific paper processing stages. Papers flow through different statuses as they are
processed by various scrapers.
</p>
<div class="row">
<div class="col-md-6">
<h6>Key Concepts:</h6>
<ul class="small">
<li><strong>Input Statuses:</strong> Paper statuses this scraper can process
</li>
<li><strong>Output Statuses:</strong> Statuses papers get after processing</li>
<li><strong>Processing Status:</strong> Temporary status while scraper works
</li>
<li><strong>Pipeline:</strong> Scrapers can be chained together</li>
</ul>
</div>
<div class="col-md-6">
<h6>Status Flow Example:</h6>
<div class="d-flex align-items-center small">
<span class="badge bg-info">New</span>
<i class="fas fa-arrow-right mx-2"></i>
<span class="badge bg-warning">Processing</span>
<i class="fas fa-arrow-right mx-2"></i>
<span class="badge bg-success">Done</span>
</div>
<div class="text-muted mt-1">Papers transition through these statuses</div>
</div>
</div>
</div>
</div>
<!-- Current System Configuration -->
<div class="card mb-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-server"></i> System Configuration
</h6>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-4">
<p><strong>Active Scraper Module:</strong> <span id="currentScraperModule"
class="badge bg-primary">Loading...</span></p>
<p><strong>Daily Volume Limit:</strong> <span
id="currentVolumeLimit">Loading...</span> papers</p>
</div>
<div class="col-md-4">
<p><strong>Total Available Modules:</strong> <span
id="totalScraperModules">Loading...</span></p>
<p><strong>Processing Pipeline:</strong> <span
id="processingPipeline">Multi-stage</span></p>
</div>
<div class="col-md-4">
<p><strong>Current Paper Counts:</strong></p>
<div id="paperCountsSummary" class="small">
<!-- Will be populated by JavaScript -->
</div>
</div>
</div>
</div>
</div>
<!-- Available Scrapers Table -->
<div class="card">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-list"></i> Available Scraper Modules
</h6>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-hover">
<thead>
<tr>
<th>Module Name</th>
<th>Description</th>
<th>Input Statuses</th>
<th>Success Output</th>
<th>Failure Output</th>
<th>Processing Status</th>
</tr>
</thead>
<tbody id="scrapersTableBody">
<!-- Table content will be populated by JavaScript -->
</tbody>
</table>
</div>
</div>
</div>
<!-- Publisher Parser Overview -->
<div class="card mt-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-building"></i> Publisher Parser Overview
</h6>
</div>
<div class="card-body">
<div class="row mb-3">
<div class="col-md-12">
<p class="text-muted mb-2">
<i class="fas fa-info-circle"></i>
Publishers are detected from paper URLs and mapped to specific parser modules
for content extraction.
</p>
</div>
</div>
<!-- Publisher Statistics -->
<div class="row mb-4" id="publisherStats">
<!-- Will be populated by JavaScript -->
</div>
<!-- Publishers Table -->
<div class="table-responsive">
<table class="table table-hover table-sm">
<thead>
<tr>
<th>Publisher</th>
<th>Papers</th>
<th>Parser Status</th>
<th>Parser Available</th>
</tr>
</thead>
<tbody id="publishersTableBody">
<!-- Table content will be populated by JavaScript -->
</tbody>
</table>
</div>
</div>
</div>
<!-- Status Flow Diagram -->
<div class="card mt-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-project-diagram"></i> Paper Status Flow Diagram
</h6>
</div>
<div class="card-body">
<div id="statusFlowDiagram" class="text-center py-4">
<!-- This will be populated by JavaScript -->
</div>
</div>
</div>
</div>
</div>
<div class="modal-footer">
<div class="d-flex justify-content-between w-100">
<small class="text-muted">
<i class="fas fa-lightbulb"></i>
Tip: Scrapers can be chained to create complex processing pipelines
</small>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<style>
/* Custom styles for the scraper overview modal */
#scraperOverviewModal .modal-xl {
max-width: 1200px;
}
#scraperOverviewModal .table th {
font-size: 0.9rem;
background-color: #f8f9fa;
}
#scraperOverviewModal .badge {
font-size: 0.75rem;
}
#scraperOverviewModal .status-badge {
margin: 2px;
display: inline-block;
}
.status-flow-node {
display: inline-block;
padding: 8px 16px;
margin: 4px;
border-radius: 20px;
font-size: 0.9rem;
font-weight: 500;
}
.status-flow-arrow {
color: #6c757d;
margin: 0 8px;
}
.scraper-description {
max-width: 300px;
word-break: break-word;
}
.input-status-list {
max-width: 150px;
}
.status-output {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 0.8rem;
}
</style>

View File

@ -114,20 +114,44 @@
<div class="col-md-6">
<div class="card">
<div class="card-header">
<h5>Volume Configuration</h5>
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraper Configuration</h5>
<button type="button" class="btn btn-outline-info btn-sm" onclick="showScraperOverview()"
title="View scraper modules overview">
<i class="fas fa-info-circle"></i> How Scrapers Work
</button>
</div>
<div class="card-body">
<form id="volumeForm">
<div class="form-group">
<div class="form-group mb-3">
<label for="volumeInput">Papers per day:</label>
<input type="number" class="form-control" id="volumeInput"
value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
<button type="submit" class="btn btn-primary mt-2">
<i class="fas fa-save"></i> Update Volume
</button>
</div>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
<div class="form-group mb-3">
<label for="mainScraperSelect">Scraper Module:</label>
<select class="form-control" id="mainScraperSelect">
{% for module in available_scraper_modules %}
<option value="{{ module }}" {% if module==current_scraper_module %}selected{% endif %}>
{{ module }}
{% if scraper_details[module] %}
- {{ scraper_details[module].description[:50] }}{% if
scraper_details[module].description|length > 50 %}...{% endif %}
{% endif %}
</option>
{% endfor %}
</select>
<div class="form-text">
Select which scraper module to use for automated processing. Current: <strong>{{
current_scraper_module }}</strong>
</div>
</div>
<button type="submit" class="btn btn-primary">
<i class="fas fa-save"></i> Update Configuration
</button>
</form>
</div>
</div>
@ -306,6 +330,10 @@
</div>
</div>
</div>
<!-- Include the scraper overview modal -->
{% include "partials/scraper_overview_modal.html.jinja" %}
{% endblock content %}
{% block scripts %}
@ -320,6 +348,7 @@
<script src="{{ url_for('static', filename='js/paper-processor.js') }}"></script>
<script src="{{ url_for('static', filename='js/activity-monitor.js') }}"></script>
<script src="{{ url_for('static', filename='js/scraper-dashboard.js') }}"></script>
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
<script id="scraper-config" type="application/json">
{