adds scraper modules and modular publisher parser system

parent ce6bc03b46
commit a7964a2f3d

22 Makefile
@@ -1,5 +1,5 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics clean-papers purge-db

# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
@@ -14,7 +14,7 @@ clean:
	rm -rf venv build dist .pytest_cache .mypy_cache *.egg-info

# Define database path
DB_PATH=scipaperloader/papers.db
DB_PATH=instance/papers.db

# Backup the database with timestamp
backup-db:
@@ -90,6 +90,24 @@ reset-db: venv
	$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
	$(PYTHON) -m flask --app scipaperloader db upgrade

# Clean all papers from the database (keep other tables intact)
clean-papers: venv
	@echo "Cleaning all papers from the database..."
	@$(PYTHON) -c "from scipaperloader.db import db; from scipaperloader.models import PaperMetadata; from scipaperloader import create_app; app = create_app(); app.app_context().push(); PaperMetadata.query.delete(); db.session.commit(); print('All papers have been removed from the database')"

# Completely purge all database contents (removes all tables and data)
purge-db: venv
	@echo "WARNING: This will completely wipe all database contents!"
	@read -p "Are you sure you want to continue? (y/N) " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
		echo "Purging database..."; \
		rm -f $(DB_PATH); \
		echo "Database completely purged"; \
	else \
		echo "Operation cancelled"; \
	fi

# Create and set up virtual environment
venv:
	python3 -m venv venv && \
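The new clean-papers target packs its whole database routine into a single python -c string. As a hedged illustration (not part of the commit), the same logic unrolled into a standalone script reads like this; the imports and calls are taken verbatim from the target:

# Illustrative sketch only: the clean-papers one-liner above, unrolled.
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata

app = create_app()
with app.app_context():
    deleted = PaperMetadata.query.delete()  # remove all paper rows, keep other tables
    db.session.commit()
    print(f"All papers have been removed from the database ({deleted} rows)")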
@@ -15,6 +15,8 @@ dependencies = [
    "pandas>=2.2.3,<3",
    "APScheduler>=3.10.4,<4",
    "flask-migrate>=4.1.0,<5",
    "beautifulsoup4>=4.13.4,<5",
    "requests>=2.32.4,<3"
]

[project.optional-dependencies]
@@ -29,6 +29,10 @@ def index():
    # Get volume configuration
    volume_config = VolumeConfig.get_current_volume()

    # Get scraper module configuration
    from ..models import ScraperModuleConfig
    current_scraper_module = ScraperModuleConfig.get_current_module()

    # Get paper counts by status
    paper_counts = {
        'new': PaperMetadata.query.filter_by(status='New').count(),
@@ -46,7 +50,10 @@ def index():
        recent_logs=recent_logs,
        paper_counts=paper_counts,
        volume_config=volume_config,
        max_volume=MAX_VOLUME
        max_volume=MAX_VOLUME,
        current_scraper_module=current_scraper_module,
        available_scraper_modules=[s["name"] for s in available_scrapers],
        scraper_details={s["name"]: s for s in available_scrapers}
    )

@bp.route("/start", methods=["POST"])
@@ -219,6 +226,13 @@ def get_status():
        # Get current hour quota info
        current_quota = scraper_manager.get_current_hour_quota()

        # Get current scraper module configuration
        from ..models import ScraperModuleConfig
        current_scraper_module = ScraperModuleConfig.get_current_module()

        # Get volume configuration
        current_volume = VolumeConfig.get_current_volume()

        return jsonify({
            "success": True,
            "scraper_state": {
@@ -227,7 +241,9 @@ def get_status():
                "last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
            },
            "paper_counts": paper_counts,
            "current_quota": current_quota
            "current_quota": current_quota,
            "current_scraper_module": current_scraper_module,
            "volume_config": current_volume
        })

    except Exception as e:
@@ -665,6 +681,35 @@ def update_scraper_config():
                "message": message
            }), 400

        # Handle scraper module configuration updates
        if "scraper_module" in data:
            from ..models import ScraperModuleConfig

            new_module = data["scraper_module"]

            # Validate that the module exists and is valid
            available_modules = [m["name"] for m in get_available_scrapers()]

            if new_module not in available_modules:
                return jsonify({
                    "success": False,
                    "message": f"Invalid scraper module: {new_module}"
                }), 400

            # Update the database configuration
            ScraperModuleConfig.set_module(new_module)

            ActivityLog.log_scraper_command(
                action="update_scraper_module",
                status="success",
                description=f"Updated scraper module to '{new_module}'"
            )

            return jsonify({
                "success": True,
                "message": f"Scraper module updated to '{new_module}' successfully"
            })

        # Handle other configuration updates here if needed in the future

        return jsonify({
@@ -681,4 +726,73 @@ def update_scraper_config():
        return jsonify({
            "success": False,
            "message": f"Error updating scraper config: {str(e)}"
        }), 500

@bp.route("/publishers")
def get_publishers():
    """Get publisher overview data for the scraper overview modal."""
    try:
        import os
        import glob

        # Get available parser modules
        parsers_dir = os.path.join(current_app.root_path, 'parsers')
        parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py'))
        available_parsers = []

        for parser_file in parser_files:
            filename = os.path.basename(parser_file)
            if filename != 'base_parser.py':  # Skip the base parser
                parser_name = filename.replace('_parser.py', '')
                available_parsers.append(parser_name)

        # Get publishers from database (papers that have publisher detected)
        publisher_query = db.session.query(
            PaperMetadata.publisher,
            db.func.count(PaperMetadata.id).label('paper_count')
        ).filter(
            PaperMetadata.publisher.isnot(None),
            PaperMetadata.publisher != ''
        ).group_by(PaperMetadata.publisher).all()

        publishers_data = []
        for publisher, count in publisher_query:
            # Check if a parser exists for this publisher
            has_parser = publisher in available_parsers

            publishers_data.append({
                'name': publisher,
                'paper_count': count,
                'has_parser': has_parser,
                'parser_status': 'available' if has_parser else 'missing'
            })

        # Sort by paper count descending
        publishers_data.sort(key=lambda x: x['paper_count'], reverse=True)

        # Get totals
        total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data)
        total_papers_without_publisher = PaperMetadata.query.filter(
            db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '')
        ).count()

        return jsonify({
            'success': True,
            'data': {
                'publishers': publishers_data,
                'available_parsers': available_parsers,
                'stats': {
                    'total_publishers': len(publishers_data),
                    'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]),
                    'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]),
                    'total_papers_with_publisher': total_papers_with_publisher,
                    'total_papers_without_publisher': total_papers_without_publisher
                }
            }
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error getting publisher data: {str(e)}'
        }), 500
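A quick hedged sketch of how a client might consume the new publisher overview endpoint. The blueprint's URL prefix is not visible in this diff, so the /scraper prefix and the local port below are assumptions:

# Illustrative sketch: querying the new publisher overview endpoint.
import requests

resp = requests.get("http://localhost:5000/scraper/publishers", timeout=10)  # prefix assumed
payload = resp.json()
if payload.get("success"):
    for pub in payload["data"]["publishers"]:
        print(pub["name"], pub["paper_count"], pub["parser_status"])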
@@ -191,6 +191,7 @@ class PaperMetadata(db.Model):
    type = db.Column(db.String(50))
    language = db.Column(db.String(50))
    published_online = db.Column(db.Date)  # or DateTime/String
    publisher = db.Column(db.String(100), nullable=True)  # Detected publisher name
    status = db.Column(db.String(10))  # 'Pending','Done','Failed'
    previous_status = db.Column(db.String(10), nullable=True)  # Store previous status for reversion
    file_path = db.Column(db.Text)
6 scipaperloader/parsers/__init__.py Normal file
@@ -0,0 +1,6 @@
# Parser modules for extracting full text from publisher-specific HTML content
from .base_parser import BaseParser, ParsedContent, ParseError
from .elsevier_parser import ElsevierParser
from .arxiv_parser import ArxivParser

__all__ = ['BaseParser', 'ParsedContent', 'ParseError', 'ElsevierParser', 'ArxivParser']
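The package exports the two concrete parsers together with the shared base types. A minimal sketch of the intended selection pattern, mirroring the loop in scrapers/text_extractor.py further down (the helper function name here is illustrative only):

# Illustrative sketch: try the exported parsers in order, use the first match.
from scipaperloader.parsers import ArxivParser, ElsevierParser, ParseError

def extract_text(html: str, doi: str):
    for parser in (ElsevierParser(), ArxivParser()):
        if parser.can_parse(html):
            try:
                return parser.parse(html, doi)
            except ParseError:
                return None
    return None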
227 scipaperloader/parsers/arxiv_parser.py Normal file
@@ -0,0 +1,227 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List
|
||||
from .base_parser import BaseParser, ParsedContent, ParseError
|
||||
|
||||
class ArxivParser(BaseParser):
|
||||
"""Parser for arXiv papers."""
|
||||
|
||||
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
|
||||
"""Check if this is an arXiv page."""
|
||||
html_lower = html_content.lower()
|
||||
|
||||
# Check for arXiv indicators
|
||||
indicators = [
|
||||
'arxiv.org',
|
||||
'export.arxiv.org',
|
||||
'arxiv:',
|
||||
'meta name="citation_publisher" content="arxiv"',
|
||||
]
|
||||
|
||||
return any(indicator in html_lower for indicator in indicators)
|
||||
|
||||
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
|
||||
"""Parse arXiv HTML content."""
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
title = self._extract_title(soup)
|
||||
|
||||
# Extract abstract
|
||||
abstract = self._extract_abstract(soup)
|
||||
|
||||
# Extract authors
|
||||
authors = self._extract_authors(soup)
|
||||
|
||||
# Extract full text (arXiv usually just has abstract on the HTML page)
|
||||
full_text = self._extract_full_text(soup, abstract)
|
||||
|
||||
# Extract keywords/subjects
|
||||
keywords = self._extract_subjects(soup)
|
||||
|
||||
# Extract arxiv ID
|
||||
arxiv_id = self._extract_arxiv_id(soup)
|
||||
|
||||
if not full_text or len(full_text.strip()) < 50:
|
||||
raise ParseError("Could not extract meaningful content from arXiv page")
|
||||
|
||||
return ParsedContent(
|
||||
full_text=full_text,
|
||||
title=title,
|
||||
abstract=abstract,
|
||||
authors=authors,
|
||||
keywords=keywords,
|
||||
sections=None, # arXiv HTML pages don't usually have full sections
|
||||
references=None, # References are typically in the PDF
|
||||
doi=doi,
|
||||
journal="arXiv",
|
||||
publication_date=self._extract_submission_date(soup),
|
||||
metadata={
|
||||
'parser': 'arxiv',
|
||||
'arxiv_id': arxiv_id,
|
||||
'source': 'arxiv.org'
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ParseError(f"Failed to parse arXiv content: {str(e)}")
|
||||
|
||||
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract paper title."""
|
||||
# Try multiple title selectors for arXiv
|
||||
selectors = [
|
||||
'h1.title',
|
||||
'meta[name="citation_title"]',
|
||||
'title'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_title'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True)
|
||||
# Remove "Title:" prefix if present
|
||||
text = re.sub(r'^Title:\s*', '', text)
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract paper abstract."""
|
||||
# arXiv abstract selectors
|
||||
selectors = [
|
||||
'blockquote.abstract',
|
||||
'div.abstract',
|
||||
'meta[name="citation_abstract"]'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_abstract'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True)
|
||||
# Remove "Abstract:" prefix if present
|
||||
text = re.sub(r'^Abstract:\s*', '', text)
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract author names."""
|
||||
authors = []
|
||||
|
||||
# Try author meta tags
|
||||
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
|
||||
if author_metas:
|
||||
authors = [meta.get('content', '').strip() for meta in author_metas]
|
||||
|
||||
# Try arXiv author div
|
||||
if not authors:
|
||||
authors_div = soup.select_one('div.authors')
|
||||
if authors_div:
|
||||
# Extract author links or text
|
||||
author_links = authors_div.find_all('a')
|
||||
if author_links:
|
||||
authors = [link.get_text(strip=True) for link in author_links]
|
||||
else:
|
||||
# Fallback to text parsing
|
||||
text = authors_div.get_text()
|
||||
# Remove "Authors:" prefix and split by commas
|
||||
text = re.sub(r'^Authors?:\s*', '', text)
|
||||
authors = [author.strip() for author in text.split(',')]
|
||||
|
||||
return authors if authors else None
|
||||
|
||||
def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
|
||||
"""Extract main content (usually just abstract for arXiv HTML pages)."""
|
||||
content_parts = []
|
||||
|
||||
# For arXiv, the HTML page typically only contains abstract and metadata
|
||||
# The full text is in the PDF
|
||||
|
||||
if abstract:
|
||||
content_parts.append(f"Abstract\n{abstract}")
|
||||
|
||||
# Look for any additional content sections
|
||||
comments_section = soup.select_one('td.comments')
|
||||
if comments_section:
|
||||
comments = comments_section.get_text(strip=True)
|
||||
if comments:
|
||||
content_parts.append(f"Comments\n{comments}")
|
||||
|
||||
# Add note about PDF availability
|
||||
content_parts.append(
|
||||
"\nNote: This is the abstract and metadata from the arXiv HTML page. "
|
||||
"The full text is available in the PDF version."
|
||||
)
|
||||
|
||||
return '\n\n'.join(content_parts)
|
||||
|
||||
def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract subject classifications."""
|
||||
subjects = []
|
||||
|
||||
# Look for subject classification
|
||||
subjects_td = soup.select_one('td.subjects')
|
||||
if subjects_td:
|
||||
subjects_text = subjects_td.get_text(strip=True)
|
||||
# Parse subjects (format: "Primary: subject1; Secondary: subject2")
|
||||
subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
|
||||
# Clean up prefixes
|
||||
subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
|
||||
subjects = [subj for subj in subjects if subj] # Remove empty strings
|
||||
|
||||
return subjects if subjects else None
|
||||
|
||||
def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract arXiv ID."""
|
||||
# Look for arXiv ID in various places
|
||||
arxiv_id_patterns = [
|
||||
r'arXiv:(\d+\.\d+(?:v\d+)?)',
|
||||
r'(\d{4}\.\d{4,5}(?:v\d+)?)',
|
||||
]
|
||||
|
||||
# Search in page text
|
||||
page_text = soup.get_text()
|
||||
for pattern in arxiv_id_patterns:
|
||||
match = re.search(pattern, page_text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
# Search in URL or meta tags
|
||||
canonical_link = soup.find('link', attrs={'rel': 'canonical'})
|
||||
if canonical_link:
|
||||
href = canonical_link.get('href', '')
|
||||
for pattern in arxiv_id_patterns:
|
||||
match = re.search(pattern, href)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract submission date."""
|
||||
# Look for submission date
|
||||
submission_td = soup.select_one('td.submission-history')
|
||||
if submission_td:
|
||||
date_text = submission_td.get_text()
|
||||
# Extract date (format varies)
|
||||
date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
|
||||
if date_match:
|
||||
return date_match.group(1)
|
||||
|
||||
# Try meta tag
|
||||
date_meta = soup.find('meta', attrs={'name': 'citation_date'})
|
||||
if date_meta:
|
||||
return date_meta.get('content', '').strip()
|
||||
|
||||
return None
|
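For reference, the two arXiv ID patterns used above behave like this on a made-up sentence:

# Illustrative sketch: what the arXiv ID patterns match.
import re

patterns = [
    r'arXiv:(\d+\.\d+(?:v\d+)?)',
    r'(\d{4}\.\d{4,5}(?:v\d+)?)',
]
sample = "We release code for arXiv:2101.00123v2 alongside the paper."  # made-up ID
for pattern in patterns:
    match = re.search(pattern, sample)
    if match:
        print(match.group(1))  # -> 2101.00123v2
        break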
83 scipaperloader/parsers/base_parser.py Normal file
@@ -0,0 +1,83 @@
from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass

@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML."""
    full_text: str
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    sections: Optional[Dict[str, str]] = None  # section_title -> section_content
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    publication_date: Optional[str] = None
    metadata: Optional[Dict] = None  # Additional metadata specific to publisher

class BaseParser(ABC):
    """Base class for all publisher-specific parsers."""

    def __init__(self):
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """
        pass

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """
        pass

    def get_name(self) -> str:
        """Return the name of this parser."""
        return self.parser_name

    def get_description(self) -> str:
        """Return a description of this parser."""
        # __doc__ always exists on a class (it is None when there is no
        # docstring), so a getattr default never kicks in; use `or` instead.
        return self.__class__.__doc__ or "No description available"

    def validate_content(self, content: ParsedContent) -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if content is valid, False otherwise
        """
        # Basic validation - must have some full text
        if not content.full_text or len(content.full_text.strip()) < 100:
            return False

        return True

class ParseError(Exception):
    """Exception raised when parsing fails."""
    pass
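BaseParser leaves only can_parse() and parse() abstract. A minimal hypothetical subclass (not part of this commit) showing the contract:

# Illustrative sketch of the BaseParser contract; this fallback parser is hypothetical.
from typing import Optional
from bs4 import BeautifulSoup
from scipaperloader.parsers.base_parser import BaseParser, ParsedContent, ParseError

class PlainHtmlParser(BaseParser):
    """Fallback parser that keeps the visible text of any HTML page."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        return "<html" in html_content.lower()

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)
        if not text:
            raise ParseError("No visible text found")
        return ParsedContent(full_text=text, doi=doi, metadata={'parser': self.get_name()})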
252 scipaperloader/parsers/elsevier_parser.py Normal file
@@ -0,0 +1,252 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List
|
||||
from .base_parser import BaseParser, ParsedContent, ParseError
|
||||
|
||||
class ElsevierParser(BaseParser):
|
||||
"""Parser for Elsevier/ScienceDirect articles."""
|
||||
|
||||
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
|
||||
"""Check if this is an Elsevier/ScienceDirect page."""
|
||||
html_lower = html_content.lower()
|
||||
|
||||
# Check for Elsevier/ScienceDirect indicators
|
||||
indicators = [
|
||||
'sciencedirect.com',
|
||||
'elsevier.com',
|
||||
'meta name="citation_publisher" content="elsevier"',
|
||||
'copyright.*elsevier',
|
||||
'sciencedirect',
|
||||
]
|
||||
|
||||
return any(indicator in html_lower for indicator in indicators)
|
||||
|
||||
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
|
||||
"""Parse Elsevier/ScienceDirect HTML content."""
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
title = self._extract_title(soup)
|
||||
|
||||
# Extract abstract
|
||||
abstract = self._extract_abstract(soup)
|
||||
|
||||
# Extract authors
|
||||
authors = self._extract_authors(soup)
|
||||
|
||||
# Extract full text
|
||||
full_text = self._extract_full_text(soup)
|
||||
|
||||
# Extract sections
|
||||
sections = self._extract_sections(soup)
|
||||
|
||||
# Extract keywords
|
||||
keywords = self._extract_keywords(soup)
|
||||
|
||||
# Extract references
|
||||
references = self._extract_references(soup)
|
||||
|
||||
# Extract journal info
|
||||
journal = self._extract_journal(soup)
|
||||
|
||||
# Extract publication date
|
||||
publication_date = self._extract_publication_date(soup)
|
||||
|
||||
# Combine everything into full text if sections exist
|
||||
if sections:
|
||||
full_text = self._combine_sections(sections, abstract)
|
||||
|
||||
if not full_text or len(full_text.strip()) < 100:
|
||||
raise ParseError("Could not extract meaningful full text content")
|
||||
|
||||
return ParsedContent(
|
||||
full_text=full_text,
|
||||
title=title,
|
||||
abstract=abstract,
|
||||
authors=authors,
|
||||
keywords=keywords,
|
||||
sections=sections,
|
||||
references=references,
|
||||
doi=doi,
|
||||
journal=journal,
|
||||
publication_date=publication_date,
|
||||
metadata={
|
||||
'parser': 'elsevier',
|
||||
'source': 'sciencedirect'
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ParseError(f"Failed to parse Elsevier content: {str(e)}")
|
||||
|
||||
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract article title."""
|
||||
# Try multiple title selectors
|
||||
selectors = [
|
||||
'h1.title-text',
|
||||
'h1[data-testid="title"]',
|
||||
'h1.article-title',
|
||||
'meta[name="citation_title"]',
|
||||
'title'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_title'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract article abstract."""
|
||||
selectors = [
|
||||
'div.abstract-content',
|
||||
'div[data-testid="abstract"]',
|
||||
'div.abstract',
|
||||
'section.abstract',
|
||||
'div#abstract'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract author names."""
|
||||
authors = []
|
||||
|
||||
# Try author meta tags
|
||||
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
|
||||
if author_metas:
|
||||
authors = [meta.get('content', '').strip() for meta in author_metas]
|
||||
|
||||
# Try author div/span elements
|
||||
if not authors:
|
||||
author_elements = soup.select('div.author a, span.author, .author-name')
|
||||
authors = [elem.get_text(strip=True) for elem in author_elements]
|
||||
|
||||
return authors if authors else None
|
||||
|
||||
def _extract_full_text(self, soup: BeautifulSoup) -> str:
|
||||
"""Extract main article content."""
|
||||
content_parts = []
|
||||
|
||||
# Try main content selectors
|
||||
main_selectors = [
|
||||
'div.article-content',
|
||||
'div.body-content',
|
||||
'main.article-body',
|
||||
'div[data-testid="article-body"]',
|
||||
'section.article-section'
|
||||
]
|
||||
|
||||
for selector in main_selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Remove script, style, and navigation elements
|
||||
for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
|
||||
unwanted.decompose()
|
||||
|
||||
text = element.get_text(separator='\n', strip=True)
|
||||
if text and len(text) > 50: # Only add substantial content
|
||||
content_parts.append(text)
|
||||
|
||||
return '\n\n'.join(content_parts)
|
||||
|
||||
def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
|
||||
"""Extract article sections with headings."""
|
||||
sections = {}
|
||||
|
||||
# Look for section headings and content
|
||||
section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))
|
||||
|
||||
for heading in section_elements:
|
||||
section_title = heading.get_text(strip=True)
|
||||
|
||||
# Find content after this heading until next heading
|
||||
content_parts = []
|
||||
current = heading.next_sibling
|
||||
|
||||
while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
|
||||
if hasattr(current, 'get_text'):
|
||||
text = current.get_text(strip=True)
|
||||
if text:
|
||||
content_parts.append(text)
|
||||
current = current.next_sibling
|
||||
|
||||
if content_parts:
|
||||
sections[section_title] = '\n'.join(content_parts)
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract article keywords."""
|
||||
keywords = []
|
||||
|
||||
# Try keyword meta tags
|
||||
keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
|
||||
if keyword_metas:
|
||||
for meta in keyword_metas:
|
||||
content = meta.get('content', '')
|
||||
if content:
|
||||
keywords.extend([kw.strip() for kw in content.split(',')])
|
||||
|
||||
# Try keyword sections
|
||||
if not keywords:
|
||||
keyword_sections = soup.select('div.keywords, section.keywords')
|
||||
for section in keyword_sections:
|
||||
text = section.get_text()
|
||||
keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])
|
||||
|
||||
return keywords if keywords else None
|
||||
|
||||
def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract references."""
|
||||
references = []
|
||||
|
||||
ref_sections = soup.select('section.references, div.references, ol.references li')
|
||||
for section in ref_sections:
|
||||
if section.name == 'li':
|
||||
references.append(section.get_text(strip=True))
|
||||
else:
|
||||
ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
|
||||
references.extend([item.get_text(strip=True) for item in ref_items])
|
||||
|
||||
return references if references else None
|
||||
|
||||
def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract journal name."""
|
||||
journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
|
||||
if journal_meta:
|
||||
return journal_meta.get('content', '').strip()
|
||||
|
||||
return None
|
||||
|
||||
def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract publication date."""
|
||||
date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
|
||||
if date_meta:
|
||||
return date_meta.get('content', '').strip()
|
||||
|
||||
return None
|
||||
|
||||
def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
|
||||
"""Combine all sections into full text."""
|
||||
full_text_parts = []
|
||||
|
||||
if abstract:
|
||||
full_text_parts.append(f"Abstract\n{abstract}")
|
||||
|
||||
for section_title, section_content in sections.items():
|
||||
full_text_parts.append(f"{section_title}\n{section_content}")
|
||||
|
||||
return '\n\n'.join(full_text_parts)
|
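_combine_sections() simply stitches the abstract and the section dictionary back into one text block. A tiny illustration of the output shape, with made-up data:

# Illustrative sketch: the text shape produced from a sections dict plus abstract.
sections = {
    "1. Introduction": "Motivation and prior work ...",
    "2. Methods": "Data collection and analysis ...",
}
abstract = "We study ..."

parts = [f"Abstract\n{abstract}"]
parts += [f"{title}\n{body}" for title, body in sections.items()]
full_text = '\n\n'.join(parts)
print(full_text)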
@ -18,6 +18,43 @@ class BaseScraper(ABC):
|
||||
OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping
|
||||
OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the scraper."""
|
||||
self.scraper_name = self.get_name().lower()
|
||||
|
||||
def log_scrape_start(self, doi: str, paper_id: Optional[int] = None):
|
||||
"""Log the start of a scraping operation."""
|
||||
from ..models import ActivityLog
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action=f"{self.scraper_name}_scrape_start",
|
||||
status="info",
|
||||
description=f"Starting {self.get_name()} for DOI: {doi}",
|
||||
paper_id=paper_id
|
||||
)
|
||||
|
||||
def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None):
|
||||
"""Log successful completion of scraping."""
|
||||
from ..models import ActivityLog
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action=f"{self.scraper_name}_scrape_success",
|
||||
status="success",
|
||||
description=f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
|
||||
paper_id=paper_id
|
||||
)
|
||||
|
||||
def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None):
|
||||
"""Log failed scraping operation."""
|
||||
from ..models import ActivityLog
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action=f"{self.scraper_name}_scrape_failure",
|
||||
status="error",
|
||||
description=f"{self.get_name()} failed for DOI: {doi} - {message}",
|
||||
paper_id=paper_id
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""
|
||||
|
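The new log_scrape_start/log_scrape_success/log_scrape_failure helpers centralize the ActivityLog calls that the individual scrapers previously made themselves. A hedged sketch of how a scraper subclass is expected to use them (the dummy scraper below follows this pattern; the actual fetch step is elided):

# Illustrative sketch: a scraper subclass using the new logging helpers.
import time
from datetime import datetime
from scipaperloader.scrapers.base import BaseScraper, ScrapeResult

class Scraper(BaseScraper):
    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        self.log_scrape_start(doi)
        try:
            # ... fetch and store something for `doi` ...
            self.log_scrape_success(doi, "done")
            status = "success"
        except Exception as exc:
            self.log_scrape_failure(doi, str(exc))
            status = "error"
        return ScrapeResult(
            status=status,
            message="",
            data=None,
            duration=time.time() - start,
            timestamp=datetime.utcnow(),
        )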
@ -30,6 +30,9 @@ class Scraper(BaseScraper):
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Simulate processing time (1-3 seconds)
|
||||
processing_time = random.uniform(1, 3)
|
||||
time.sleep(processing_time)
|
||||
@ -145,12 +148,7 @@ class Scraper(BaseScraper):
|
||||
)
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="dummy_scrape",
|
||||
status="success",
|
||||
description=f"Successfully scraped {doi}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
@ -178,12 +176,7 @@ class Scraper(BaseScraper):
|
||||
paper.error_msg = error_msg
|
||||
|
||||
# Log failure
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="dummy_scrape",
|
||||
status="error",
|
||||
description=f"Failed to scrape {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
|
@ -30,13 +30,8 @@ class Scraper(BaseScraper):
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log retry attempt
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_failed_paper",
|
||||
status="info",
|
||||
description=f"Retrying failed paper: {paper.title}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
# Log start of retry
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Simulate longer processing time for retry (2-5 seconds)
|
||||
processing_time = random.uniform(2, 5)
|
||||
@ -64,12 +59,7 @@ class Scraper(BaseScraper):
|
||||
result_data = {"file_path": file_path}
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_success",
|
||||
status="success",
|
||||
description=f"Successfully retried {doi} on second attempt",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_success(doi, f"Successfully retried {doi} on second attempt", paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
@ -81,12 +71,7 @@ class Scraper(BaseScraper):
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save retry file: {str(e)}"
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_file_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
@ -105,12 +90,7 @@ class Scraper(BaseScraper):
|
||||
]
|
||||
error_msg = random.choice(error_messages)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_failure",
|
||||
status="error",
|
||||
description=f"Retry failed for {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
|
172 scipaperloader/scrapers/html_fetcher.py Normal file
@@ -0,0 +1,172 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Scraper that fetches HTML content from DOI and saves it for further processing."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "HtmlDownloaded"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "FetchingHtml"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Fetch HTML content from DOI and save to download path."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Prepare file paths
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}.html"
|
||||
file_path = os.path.join(download_path, file_name)
|
||||
|
||||
# Check/create download directory (same pattern as dummy)
|
||||
if not os.path.exists(download_path):
|
||||
try:
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
except OSError as e:
|
||||
error_msg = f"Failed to create download directory: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check path permissions (same pattern as dummy)
|
||||
if not os.access(download_path, os.W_OK):
|
||||
error_msg = f"Download path '{download_path}' is not writable"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="html_fetch_path_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_write_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Fetch HTML from DOI
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {'User-Agent': 'SciPaperLoader/1.0'}
|
||||
response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True)
|
||||
|
||||
# Check for invalid DOI (404) or other HTTP errors
|
||||
if response.status_code == 404:
|
||||
error_msg = f"Invalid DOI: {doi} not found"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "invalid_doi"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
response.raise_for_status() # Raise for other HTTP errors
|
||||
|
||||
# Save HTML content
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response.text)
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.file_path = file_path
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
# Log success
|
||||
self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully fetched HTML for {doi}",
|
||||
data={
|
||||
"file_path": file_path,
|
||||
"url": response.url, # Final URL after redirects
|
||||
"title": paper.title
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="html_fetch",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "network_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save HTML file: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "file_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
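Running this scraper by hand needs an application context, since it reads and writes PaperMetadata. A sketch using the create_app factory referenced in the Makefile targets above (the DOI is hypothetical):

# Illustrative sketch: invoking the HTML fetcher manually inside an app context.
from scipaperloader import create_app
from scipaperloader.scrapers.html_fetcher import Scraper

app = create_app()
with app.app_context():
    result = Scraper().scrape("10.1000/example-doi")  # hypothetical DOI
    print(result.status, result.message)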
282 scipaperloader/scrapers/publisher_detector.py Normal file
@@ -0,0 +1,282 @@
|
||||
import time
|
||||
import requests
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
|
||||
|
||||
# Publisher detection patterns based on URL domains and paths
|
||||
PUBLISHER_URL_PATTERNS = {
|
||||
'elsevier': [
|
||||
r'sciencedirect\.com',
|
||||
r'elsevier\.com',
|
||||
r'.*\.elsevier\.com'
|
||||
],
|
||||
'springer': [
|
||||
r'link\.springer\.com',
|
||||
r'springer\.com',
|
||||
r'.*\.springer\.com'
|
||||
],
|
||||
'wiley': [
|
||||
r'onlinelibrary\.wiley\.com',
|
||||
r'wiley\.com',
|
||||
r'.*\.wiley\.com'
|
||||
],
|
||||
'ieee': [
|
||||
r'ieeexplore\.ieee\.org',
|
||||
r'ieee\.org',
|
||||
r'.*\.ieee\.org'
|
||||
],
|
||||
'plos': [
|
||||
r'journals\.plos\.org',
|
||||
r'plos\.org',
|
||||
r'.*\.plos\.org'
|
||||
],
|
||||
'nature': [
|
||||
r'nature\.com',
|
||||
r'.*\.nature\.com'
|
||||
],
|
||||
'sage': [
|
||||
r'journals\.sagepub\.com',
|
||||
r'sagepub\.com',
|
||||
r'.*\.sagepub\.com'
|
||||
],
|
||||
'taylor_francis': [
|
||||
r'tandfonline\.com',
|
||||
r'.*\.tandfonline\.com'
|
||||
],
|
||||
'acs': [
|
||||
r'pubs\.acs\.org',
|
||||
r'acs\.org',
|
||||
r'.*\.acs\.org'
|
||||
],
|
||||
'arxiv': [
|
||||
r'arxiv\.org',
|
||||
r'export\.arxiv\.org'
|
||||
],
|
||||
'pubmed': [
|
||||
r'pubmed\.ncbi\.nlm\.nih\.gov',
|
||||
r'ncbi\.nlm\.nih\.gov'
|
||||
],
|
||||
'oxford': [
|
||||
r'academic\.oup\.com',
|
||||
r'oup\.com',
|
||||
r'.*\.oup\.com'
|
||||
],
|
||||
'cambridge': [
|
||||
r'cambridge\.org',
|
||||
r'.*\.cambridge\.org'
|
||||
],
|
||||
'biorxiv': [
|
||||
r'biorxiv\.org',
|
||||
r'.*\.biorxiv\.org'
|
||||
],
|
||||
'researchgate': [
|
||||
r'researchgate\.net',
|
||||
r'.*\.researchgate\.net'
|
||||
]
|
||||
}
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Detect publisher from the final URL after DOI redirect."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
try:
|
||||
# Get the final URL by following the DOI redirect
|
||||
final_url = self._get_final_url(doi)
|
||||
|
||||
if not final_url:
|
||||
error_msg = f"Could not resolve DOI {doi} to a URL"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "doi_resolution_failed"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Detect publisher from URL
|
||||
detected_publisher = self._detect_publisher_from_url(final_url)
|
||||
|
||||
if detected_publisher:
|
||||
# Update paper with detected publisher
|
||||
paper.publisher = detected_publisher
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=success_msg,
|
||||
data={
|
||||
"publisher": detected_publisher,
|
||||
"final_url": final_url
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
else:
|
||||
error_msg = f"Could not detect publisher from URL: {final_url}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={
|
||||
"final_url": final_url,
|
||||
"error_code": "publisher_not_detected"
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "publisher_detection_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
def _get_final_url(self, doi: str) -> Optional[str]:
|
||||
"""
|
||||
Get the final URL after following DOI redirects.
|
||||
|
||||
Args:
|
||||
doi: The DOI to resolve
|
||||
|
||||
Returns:
|
||||
Final URL after redirects, or None if resolution fails
|
||||
"""
|
||||
try:
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {
|
||||
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
|
||||
# Make a HEAD request to get the final URL without downloading content
|
||||
response = requests.head(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=15,
|
||||
allow_redirects=True
|
||||
)
|
||||
|
||||
# If HEAD is not allowed, try GET but with minimal content
|
||||
if response.status_code == 405: # Method Not Allowed
|
||||
response = requests.get(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=15,
|
||||
allow_redirects=True,
|
||||
stream=True # Don't download the full content
|
||||
)
|
||||
response.close() # Close connection after getting headers
|
||||
|
||||
if response.status_code in [200, 302, 301]:
|
||||
return response.url
|
||||
else:
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
# Log error but don't raise - we'll handle this gracefully
|
||||
return None
|
||||
|
||||
def _detect_publisher_from_url(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
Detect publisher from URL using domain patterns.
|
||||
|
||||
Args:
|
||||
url: The URL to analyze
|
||||
|
||||
Returns:
|
||||
Publisher name if detected, None otherwise
|
||||
"""
|
||||
if not url:
|
||||
return None
|
||||
|
||||
# Parse the URL to get the domain
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc.lower()
|
||||
|
||||
# Remove 'www.' prefix if present
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
# Score each publisher based on URL pattern matches
|
||||
publisher_scores = {}
|
||||
|
||||
for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
|
||||
score = 0
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, domain, re.IGNORECASE):
|
||||
score += 10 # Strong match for domain patterns
|
||||
|
||||
# Also check the full URL for path-based patterns
|
||||
if re.search(pattern, url.lower(), re.IGNORECASE):
|
||||
score += 5
|
||||
|
||||
if score > 0:
|
||||
publisher_scores[publisher] = score
|
||||
|
||||
# Return the publisher with the highest score
|
||||
if publisher_scores:
|
||||
best_publisher = max(publisher_scores.keys(), key=lambda x: publisher_scores[x])
|
||||
|
||||
# Only return if we have a reasonable confidence (score > 5)
|
||||
if publisher_scores[best_publisher] > 5:
|
||||
return best_publisher
|
||||
|
||||
return None
|
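The detector scores each publisher by matching its domain patterns against the resolved URL and keeps the best score above the confidence threshold. The scoring idea, reduced to two of the pattern groups above and applied to a made-up URL:

# Illustrative sketch of the _detect_publisher_from_url() scoring idea.
import re
from urllib.parse import urlparse

patterns = {
    'elsevier': [r'sciencedirect\.com', r'elsevier\.com'],
    'arxiv': [r'arxiv\.org', r'export\.arxiv\.org'],
}
url = "https://www.sciencedirect.com/science/article/pii/S0000000000000000"  # made-up
domain = urlparse(url).netloc.lower().removeprefix('www.')

scores = {
    publisher: sum(10 for p in pats if re.search(p, domain, re.IGNORECASE))
    for publisher, pats in patterns.items()
}
best = max(scores, key=scores.get)
print(best if scores[best] > 5 else None)  # -> elsevier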
237 scipaperloader/scrapers/text_extractor.py Normal file
@@ -0,0 +1,237 @@
|
||||
import time
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
from ..parsers.base_parser import BaseParser, ParseError
|
||||
from ..parsers.elsevier_parser import ElsevierParser
|
||||
from ..parsers.arxiv_parser import ArxivParser
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Full text extraction scraper that uses publisher-specific parsers."""
|
||||
|
||||
# This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
|
||||
INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
|
||||
OUTPUT_STATUS_SUCCESS = "TextExtracted"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "ExtractingText"
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Registry of available parsers
|
||||
self.parsers = [
|
||||
ElsevierParser(),
|
||||
ArxivParser(),
|
||||
# Add more parsers here as you create them
|
||||
# SpringerParser(),
|
||||
# WileyParser(),
|
||||
# IEEEParser(),
|
||||
]
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Extract full text using appropriate publisher parser."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Check if HTML file exists
|
||||
if not paper.file_path or not os.path.exists(paper.file_path):
|
||||
error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "html_file_not_found"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Read HTML content
|
||||
with open(paper.file_path, 'r', encoding='utf-8') as f:
|
||||
html_content = f.read()
|
||||
|
||||
# Find appropriate parser
|
||||
parser = self._select_parser(html_content)
|
||||
|
||||
if not parser:
|
||||
error_msg = f"No suitable parser found for DOI {doi}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "no_parser_available"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Parse content
|
||||
parsed_content = parser.parse(html_content, doi)
|
||||
|
||||
# Validate parsed content
|
||||
if not parser.validate_content(parsed_content):
|
||||
error_msg = f"Parsed content validation failed for DOI {doi}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "content_validation_failed"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Save extracted text to file
|
||||
text_file_path = self._save_extracted_text(parsed_content, doi)
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.error_msg = None
|
||||
# You might want to add a text_file_path field to store the text file location
|
||||
# paper.text_file_path = text_file_path
|
||||
db.session.commit()
|
||||
|
||||
success_msg = f"Successfully extracted text using {parser.get_name()} parser"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully extracted full text for {doi}",
|
||||
data={
|
||||
"text_file_path": text_file_path,
|
||||
"parser_used": parser.get_name(),
|
||||
"title": parsed_content.title,
|
||||
"word_count": len(parsed_content.full_text.split()),
|
||||
"has_abstract": bool(parsed_content.abstract),
|
||||
"has_sections": bool(parsed_content.sections),
|
||||
"author_count": len(parsed_content.authors) if parsed_content.authors else 0,
|
||||
"keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
|
||||
"reference_count": len(parsed_content.references) if parsed_content.references else 0
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except ParseError as e:
|
||||
error_msg = f"Parser error for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "parser_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "extraction_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
def _select_parser(self, html_content: str) -> Optional[BaseParser]:
|
||||
"""
|
||||
Select the most appropriate parser for the HTML content.
|
||||
|
||||
Args:
|
||||
html_content: The HTML content to analyze
|
||||
|
||||
Returns:
|
||||
The best parser for this content, or None if no parser can handle it
|
||||
"""
|
||||
for parser in self.parsers:
|
||||
if parser.can_parse(html_content):
|
||||
return parser
|
||||
|
||||
return None
|
||||
|
||||
def _save_extracted_text(self, parsed_content, doi: str) -> str:
|
||||
"""
|
||||
Save extracted text to a file.
|
||||
|
||||
Args:
|
||||
parsed_content: The parsed content object
|
||||
doi: The DOI of the paper
|
||||
|
||||
Returns:
|
||||
Path to the saved text file
|
||||
"""
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
|
||||
text_file_path = os.path.join(download_path, text_file_name)
|
||||
|
||||
with open(text_file_path, 'w', encoding='utf-8') as f:
|
||||
# Write structured content
|
||||
f.write(f"DOI: {parsed_content.doi or doi}\n")
|
||||
f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
|
||||
f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
|
||||
f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")
|
||||
|
||||
if parsed_content.authors:
|
||||
f.write(f"Authors: {', '.join(parsed_content.authors)}\n")
|
||||
|
||||
if parsed_content.keywords:
|
||||
f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")
|
||||
|
||||
f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
|
||||
f.write("=" * 80 + "\n\n")
|
||||
|
||||
# Write full text
|
||||
f.write(parsed_content.full_text)
|
||||
|
||||
# Optionally write references at the end
|
||||
if parsed_content.references:
|
||||
f.write("\n\n" + "=" * 80 + "\n")
|
||||
f.write("REFERENCES\n")
|
||||
f.write("=" * 80 + "\n")
|
||||
for i, ref in enumerate(parsed_content.references, 1):
|
||||
f.write(f"{i}. {ref}\n")
|
||||
|
||||
return text_file_path
|
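Taken together, the new modules form a small status-driven pipeline. A summary sketch (not part of the commit) of the transitions each scraper declares via its INPUT_STATUSES and OUTPUT_STATUS_* constants:

# Illustrative summary of the declared status transitions.
PIPELINE = {
    "web_fetcher":        ("New", "WebContentDownloaded"),
    "html_fetcher":       ("New", "HtmlDownloaded"),
    "publisher_detector": ("New", "PublisherDetected"),
    "text_extractor":     (("WebContentDownloaded", "PublisherDetected"), "TextExtracted"),
}
for module, (consumes, produces) in PIPELINE.items():
    print(f"{module}: {consumes} -> {produces} (or 'Failed')")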
201 scipaperloader/scrapers/web_fetcher.py Normal file
@@ -0,0 +1,201 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Web fetcher scraper that downloads HTML content from DOI URLs."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "FetchingWebContent"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Fetch HTML content from DOI and save to download path."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Prepare file paths
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}.html"
|
||||
file_path = os.path.join(download_path, file_name)
|
||||
|
||||
# Check/create download directory
|
||||
if not os.path.exists(download_path):
|
||||
try:
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
except OSError as e:
|
||||
error_msg = f"Failed to create download directory: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check path permissions
|
||||
if not os.access(download_path, os.W_OK):
|
||||
error_msg = f"Download path '{download_path}' is not writable"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_write_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Fetch HTML from DOI
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {
|
||||
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
}
|
||||
|
||||
response = requests.get(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=30,
|
||||
allow_redirects=True,
|
||||
verify=True
|
||||
)
|
||||
|
||||
# Check for invalid DOI (404) or other HTTP errors
|
||||
if response.status_code == 404:
|
||||
error_msg = f"Invalid DOI: {doi} not found (404)"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "invalid_doi"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check for other HTTP errors
|
||||
response.raise_for_status()
|
||||
|
||||
# Save HTML content
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response.text)
|
||||
|
||||
# Extract final URL after redirects (for publisher detection)
|
||||
final_url = response.url
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.file_path = file_path
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
# Log success
|
||||
success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully fetched HTML for {doi}",
|
||||
data={
|
||||
"file_path": file_path,
|
||||
"final_url": final_url,
|
||||
"content_length": len(response.text),
|
||||
"content_type": response.headers.get('content-type', 'unknown'),
|
||||
"title": paper.title,
|
||||
"domain": urlparse(final_url).netloc if final_url else None
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.HTTPError as e:
|
||||
error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "http_error", "status_code": e.response.status_code},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Network error fetching {doi_url}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "network_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save HTML file: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "file_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
384
scipaperloader/static/js/README.md
Normal file
@ -0,0 +1,384 @@
|
||||
# JavaScript Modularization Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The JavaScript code in the SciPaperLoader application has been modularized into reusable components to improve maintainability, reduce code duplication, and enable easier testing and updates.
|
||||
|
||||
## Modularization Task Completed
|
||||
|
||||
### Problem Statement
|
||||
The original codebase had ~800+ lines of inline JavaScript scattered across multiple Jinja templates with several critical issues:
|
||||
- **Code Duplication**: Similar functionality replicated across templates
|
||||
- **Maintenance Difficulty**: Changes required editing multiple template files
|
||||
- **Linter Issues**: Jinja template syntax mixed with JavaScript caused linting errors
|
||||
- **Testing Challenges**: Inline code was difficult to unit test
|
||||
- **Poor Separation of Concerns**: Template logic mixed with application logic
|
||||
|
||||
### Solution Implemented
|
||||
Successfully transformed the codebase by:
|
||||
|
||||
1. **Extracted 10 Modular JavaScript Files** (~800+ lines of code moved from templates)
|
||||
2. **Eliminated Code Duplication** by creating reusable components
|
||||
3. **Fixed Linter Compatibility** by separating template syntax from JavaScript logic
|
||||
4. **Implemented Clean Variable Passing** using JSON script tags instead of direct Jinja embedding
|
||||
5. **Created Class-Based Architecture** with proper inheritance and composition patterns
|
||||
6. **Established Inter-Component Communication** through callback systems
|
||||
7. **Added Comprehensive Error Handling** and loading states throughout
|
||||
|
||||
### Key Achievements
|
||||
- ✅ **5 templates modularized**: `scraper.html.jinja`, `papers.html.jinja`, `upload.html.jinja`, `logger.html.jinja`, `config/schedule.html.jinja`
|
||||
- ✅ **10 JavaScript modules created**: Covering all functionality from utilities to dashboard coordination
|
||||
- ✅ **Zero functionality loss**: All existing features preserved during modularization
|
||||
- ✅ **Improved maintainability**: Changes now require editing single module files
|
||||
- ✅ **Enhanced testability**: Individual modules can be unit tested
|
||||
- ✅ **Clean variable handling**: Jinja variables passed as JSON configuration instead of inline embedding
|
||||
|
||||
### Before vs After Example
|
||||
**Before (inline in template)**:
|
||||
```html
|
||||
<script>
|
||||
var maxVolume = {{ max_volume }}; // Linter error
|
||||
$('#start-scraper').click(function() {
|
||||
// 50+ lines of mixed template/JS code
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**After (modular)**:
|
||||
```html
|
||||
<script type="application/json" id="config-data">
|
||||
{"maxVolume": {{ max_volume|tojson }}}
|
||||
</script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-control.js') }}"></script>
|
||||
<script>
|
||||
const config = JSON.parse(document.getElementById('config-data').textContent);
|
||||
new ScraperControl(config).init();
|
||||
</script>
|
||||
```
|
||||
|
||||
## Modular JavaScript Files
|
||||
|
||||
### 1. `/static/js/common.js`
|
||||
**Purpose**: Common utilities used across the application
|
||||
|
||||
**Key Functions**:
|
||||
- `showFlashMessage(message, type)` - Display flash messages to users
|
||||
- `createStatusBadge(status)` - Generate status badge HTML
|
||||
- `formatTimestamp(timestamp)` - Format timestamps for display
|
||||
- `truncateText(text, maxLength)` - Truncate text with ellipsis
|
||||
- `toggleButtonLoading(button, loading, loadingText)` - Handle button loading states
|
||||
- `apiRequest(url, options)` - Generic API request wrapper
|
||||
|
||||
**Used by**: All templates that need basic utilities
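
A small usage sketch that combines these utilities (the function signatures come from the list above; the button ID is a placeholder, and the assumption that `apiRequest` resolves with parsed JSON and rejects on failure is illustrative):

```javascript
// Hypothetical refresh button wired up with the common.js utilities above.
// Assumes common.js is loaded first so the functions are in scope.
document.getElementById("refresh-status-btn")?.addEventListener("click", async (event) => {
  const button = event.currentTarget;
  toggleButtonLoading(button, true, "Refreshing...");
  try {
    // apiRequest(url, options) is the generic wrapper listed above; it is
    // assumed here to resolve with parsed JSON and reject on failure.
    const status = await apiRequest("/scraper/status", { method: "GET" });
    console.log("Current scraper status:", status);
    showFlashMessage("Scraper status refreshed", "success");
  } catch (error) {
    showFlashMessage(truncateText(`Refresh failed: ${error.message}`, 120), "warning");
  } finally {
    toggleButtonLoading(button, false);
  }
});
```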
|
||||
|
||||
### 2. `/static/js/modal-handler.js`
|
||||
**Purpose**: Handle modal dialogs with dynamic content loading
|
||||
|
||||
**Key Features**:
|
||||
- AJAX content loading
|
||||
- Error handling
|
||||
- Automatic click handler setup
|
||||
- Bootstrap modal integration
|
||||
|
||||
**Used by**:
|
||||
- `papers.html.jinja` (paper details modal)
|
||||
- `logger.html.jinja` (log details modal)
|
||||
|
||||
### 3. `/static/js/form-handler.js`
|
||||
**Purpose**: Handle form submissions with progress tracking
|
||||
|
||||
**Key Features**:
|
||||
- Progress modal display
|
||||
- Task status polling
|
||||
- Error handling
|
||||
- Customizable callbacks
|
||||
|
||||
**Used by**:
|
||||
- `upload.html.jinja` (CSV upload form)
|
||||
|
||||
### 4. `/static/js/chart.js`
|
||||
**Purpose**: Handle Chart.js activity visualization
|
||||
|
||||
**Key Features**:
|
||||
- Chart initialization and rendering
|
||||
- Data loading from API
|
||||
- Error handling for missing Chart.js
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja` (activity charts)
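
The module's exact API is not documented here, but the pattern it wraps looks roughly like the sketch below: guard against a missing Chart.js global, load data, then render. The canvas ID, endpoint, and field names are placeholders, not the module's real names.

```javascript
// Illustrative sketch of the guard-and-render pattern, not the actual
// chart.js module API. Element ID, endpoint, and field names are placeholders.
async function renderActivityChart() {
  if (typeof Chart === "undefined") {
    console.warn("Chart.js is not available; skipping activity chart");
    return;
  }
  const canvas = document.getElementById("activityChart");
  if (!canvas) return;

  const response = await fetch("/scraper/stats"); // placeholder endpoint
  const hourly = await response.json();

  new Chart(canvas, {
    type: "bar",
    data: {
      labels: hourly.map((point) => point.hour),
      datasets: [{ label: "Papers scraped", data: hourly.map((point) => point.count) }],
    },
    options: { responsive: true },
  });
}
```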
|
||||
|
||||
### 5. `/static/js/scraper-control.js`
|
||||
**Purpose**: Handle scraper control operations (start/stop/pause/reset)
|
||||
|
||||
**Key Features**:
|
||||
- Status polling
|
||||
- Volume configuration
|
||||
- Callback system for refreshing other components
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 6. `/static/js/paper-processor.js`
|
||||
**Purpose**: Handle paper search and processing functionality
|
||||
|
||||
**Key Features**:
|
||||
- Paper search
|
||||
- Single paper processing
|
||||
- Status polling
|
||||
- Scraper selection
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 7. `/static/js/activity-monitor.js`
|
||||
**Purpose**: Handle activity log display and real-time notifications
|
||||
|
||||
**Key Features**:
|
||||
- Activity log loading
|
||||
- Real-time updates
|
||||
- Notification management
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
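
"Real-time updates" is assumed here to mean periodic polling; a minimal sketch under that assumption follows. The class name, endpoint, and list markup are placeholders rather than the module's actual interface.

```javascript
// Illustrative polling loop only; the real activity-monitor.js API may differ.
class ActivityLogPoller {
  constructor(listElementId, intervalMs = 5000) {
    this.listElement = document.getElementById(listElementId);
    this.intervalMs = intervalMs;
    this.timer = null;
  }

  start() {
    this.refresh();
    this.timer = setInterval(() => this.refresh(), this.intervalMs);
  }

  stop() {
    if (this.timer) clearInterval(this.timer);
  }

  async refresh() {
    try {
      // apiRequest, formatTimestamp and truncateText come from common.js.
      const logs = await apiRequest("/scraper/activity_logs", { method: "GET" }); // placeholder endpoint
      if (this.listElement && Array.isArray(logs)) {
        this.listElement.innerHTML = logs
          .map((log) => `<li>${formatTimestamp(log.timestamp)} - ${truncateText(log.message, 80)}</li>`)
          .join("");
      }
    } catch (error) {
      console.error("Failed to refresh activity log:", error);
    }
  }
}
```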
|
||||
|
||||
### 8. `/static/js/scraper-dashboard.js`
|
||||
**Purpose**: Coordinate all scraper dashboard components
|
||||
|
||||
**Key Features**:
|
||||
- Component initialization
|
||||
- Inter-component communication
|
||||
- Configuration management
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 9. `/static/js/config-handler.js`
|
||||
**Purpose**: Handle configuration forms and Alpine.js integration
|
||||
|
||||
**Key Features**:
|
||||
- Configuration API calls
|
||||
- Alpine.js data objects
|
||||
- Schedule management
|
||||
- Volume updates
|
||||
|
||||
**Used by**:
|
||||
- `config/schedule.html.jinja`
|
||||
|
||||
## Template Updates
|
||||
|
||||
### Templates Using Modular JavaScript
|
||||
|
||||
1. **scraper.html.jinja**
|
||||
- Uses all scraper-related modules
|
||||
- Passes Jinja variables as configuration parameters
|
||||
- Initializes dashboard with `initScraperDashboard(config)`
|
||||
|
||||
2. **papers.html.jinja**
|
||||
- Uses `modal-handler.js` for paper detail modals
|
||||
- Simplified from custom modal code to single line initialization
|
||||
|
||||
3. **upload.html.jinja**
|
||||
- Uses `form-handler.js` for upload progress tracking
|
||||
- Custom result display function
|
||||
- Automatic task status polling
|
||||
|
||||
4. **logger.html.jinja**
|
||||
- Uses `modal-handler.js` for log detail modals
|
||||
- Custom URL construction for log endpoints
|
||||
|
||||
5. **config/schedule.html.jinja**
|
||||
- Uses `config-handler.js` for Alpine.js integration
|
||||
- Modular schedule management functions
|
||||
|
||||
## Benefits of Modularization
|
||||
|
||||
### 1. **Reusability**
|
||||
- Modal functionality shared between papers and logger templates
|
||||
- Common utilities used across all templates
|
||||
- Form handling can be reused for other forms
|
||||
|
||||
### 2. **Maintainability**
|
||||
- Single place to update common functionality
|
||||
- Clear separation of concerns
|
||||
- Easier debugging and testing
|
||||
|
||||
### 3. **Parameter Passing**
|
||||
- Jinja variables passed as configuration objects
|
||||
- No more hardcoded values in JavaScript
|
||||
- Environment-specific settings easily configurable
|
||||
|
||||
### 4. **Extensibility**
|
||||
- Easy to add new functionality to existing modules
|
||||
- New templates can easily use existing modules
|
||||
- Plugin-like architecture for components
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Modal Usage
|
||||
```javascript
|
||||
const modal = new ModalHandler('modalId', 'contentElementId');
|
||||
modal.setupClickHandlers('.clickable-items');
|
||||
```
|
||||
|
||||
### Form with Progress Tracking
|
||||
```javascript
|
||||
const formHandler = new FormHandler('formId', {
|
||||
onSuccess: (result) => console.log('Success:', result),
|
||||
onError: (error) => console.log('Error:', error)
|
||||
});
|
||||
```
|
||||
|
||||
### Configuration Management
|
||||
```javascript
|
||||
// In Alpine.js template
|
||||
x-data="configHandler.createScheduleManager(initialData, volume)"
|
||||
```
|
||||
|
||||
## Migration Notes
|
||||
|
||||
### Old vs New Approach
|
||||
|
||||
**Before**: Inline JavaScript in each template
|
||||
```html
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
// Lots of inline JavaScript code
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**After**: Modular imports with configuration
|
||||
```html
|
||||
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
|
||||
<script>
|
||||
const modal = new ModalHandler('modalId', 'contentId');
|
||||
modal.setupClickHandlers('.links');
|
||||
</script>
|
||||
```
|
||||
|
||||
### Jinja Variable Handling
|
||||
|
||||
To properly separate Jinja template variables from JavaScript code and avoid linting issues, we use a clean JSON configuration approach:
|
||||
|
||||
**Before**: Variables embedded directly in JavaScript (causes linting issues)
|
||||
```javascript
|
||||
if (volume > {{ max_volume }}) {
|
||||
// Error handling - JSLint will complain about {{ }}
|
||||
}
|
||||
```
|
||||
|
||||
**After**: Clean separation using JSON script tags
|
||||
```html
|
||||
<!-- Jinja variables in JSON format -->
|
||||
<script type="application/json" id="config-data">
|
||||
{
|
||||
"maxVolume": {{ max_volume|tojson }},
|
||||
"currentVolume": {{ volume|tojson }},
|
||||
"apiUrl": {{ url_for('api.endpoint')|tojson }},
|
||||
"csrfToken": {{ csrf_token()|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<!-- Clean JavaScript that reads the configuration -->
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const config = JSON.parse(document.getElementById('config-data').textContent);
|
||||
const handler = new VolumeHandler(config);
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**Benefits of this approach**:
|
||||
- **Linter-friendly**: No template syntax in JavaScript files
|
||||
- **Type-safe**: JSON ensures proper data types
|
||||
- **Maintainable**: Clear separation of concerns
|
||||
- **Secure**: Automatic escaping with `|tojson` filter
|
||||
- **Debuggable**: Easy to inspect configuration in DevTools
|
||||
|
||||
**Real-world example from scraper.html.jinja**:
|
||||
```html
|
||||
<script type="application/json" id="scraper-config">
|
||||
{
|
||||
"statusUrl": {{ url_for('api.scraper_status')|tojson }},
|
||||
"startUrl": {{ url_for('api.start_scraper')|tojson }},
|
||||
"volume": {{ volume|tojson }},
|
||||
"scraperType": {{ scraper_type|tojson }},
|
||||
"csrfToken": {{ csrf_token()|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<script>
|
||||
const config = JSON.parse(document.getElementById('scraper-config').textContent);
|
||||
initScraperDashboard(config);
|
||||
</script>
|
||||
```
|
||||
|
||||
## Future Improvements
|
||||
|
||||
### Potential Enhancements
|
||||
1. **Bundle Management**: Consider using webpack or similar for production builds
|
||||
2. **Unit Testing**: Add comprehensive test suite for individual modules
|
||||
3. **JSDoc Comments**: Add detailed documentation for better IDE support
|
||||
4. **Centralized Error Reporting**: Implement global error handling system
|
||||
5. **Performance Optimization**: Implement lazy loading for non-critical modules
|
||||
6. **TypeScript Migration**: Consider migrating to TypeScript for better type safety
|
||||
|
||||
### Adding New Modules
|
||||
When creating new JavaScript modules (a minimal skeleton is sketched after this list):
|
||||
1. Follow the established class-based pattern
|
||||
2. Include proper error handling
|
||||
3. Use the configuration pattern for Jinja variables
|
||||
4. Add documentation to this README
|
||||
5. Update templates to use the new module
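
A skeleton following these conventions might look like the sketch below; the class name, config keys, and endpoint are placeholders (this is not the actual `table-handler.js` implementation):

```javascript
// example-widget.js -- placeholder skeleton for a new module.
class ExampleWidget {
  constructor(config) {
    // Jinja values arrive via the JSON <script type="application/json"> pattern,
    // never embedded inline in JavaScript.
    this.containerId = config.containerId;
    this.dataUrl = config.dataUrl;
  }

  init() {
    this.container = document.getElementById(this.containerId);
    if (!this.container) {
      console.warn(`ExampleWidget: container "${this.containerId}" not found`);
      return;
    }
    this.load();
  }

  async load() {
    try {
      // apiRequest and createStatusBadge are shared utilities from common.js.
      const items = await apiRequest(this.dataUrl, { method: "GET" });
      this.container.innerHTML = items
        .map((item) => `<div>${item.title} ${createStatusBadge(item.status)}</div>`)
        .join("");
    } catch (error) {
      showFlashMessage("Failed to load widget data", "error");
    }
  }
}
```

The template would then pass its Jinja values to the module through a JSON config block, exactly as in the examples above.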
|
||||
|
||||
## Testing
|
||||
|
||||
A test file `test_js_modularization.py` has been created to verify the modularization. To run comprehensive testing:
|
||||
|
||||
```bash
|
||||
python test_js_modularization.py
|
||||
```
|
||||
|
||||
This will verify:
|
||||
- All JavaScript files exist and are properly formatted
|
||||
- Templates correctly reference the modular files
|
||||
- Configuration patterns are properly implemented
|
||||
- No inline JavaScript remains in templates
|
||||
|
||||
## Maintenance
|
||||
|
||||
### When Making Changes
|
||||
1. **Update Single Module**: Changes to functionality only require editing one file
|
||||
2. **Test Affected Templates**: Ensure all templates using the module still work
|
||||
3. **Update Documentation**: Keep this README current with any changes
|
||||
4. **Consider Dependencies**: Check if changes affect other modules
|
||||
|
||||
### File Organization
|
||||
```
|
||||
/static/js/
|
||||
├── README.md # This documentation
|
||||
├── common.js # Shared utilities
|
||||
├── modal-handler.js # Modal functionality
|
||||
├── form-handler.js # Form processing
|
||||
├── chart.js # Chart visualization
|
||||
├── scraper-control.js # Scraper operations
|
||||
├── paper-processor.js # Paper management
|
||||
├── activity-monitor.js # Activity tracking
|
||||
├── scraper-dashboard.js # Dashboard coordination
|
||||
├── config-handler.js # Configuration management
|
||||
└── table-handler.js # Table utilities
|
||||
```
|
||||
|
||||
## Migration Summary
|
||||
|
||||
The modularization successfully transformed **~800+ lines of inline JavaScript** from templates into a maintainable, reusable module system. This improvement provides:
|
||||
|
||||
- **Enhanced maintainability** through single-responsibility modules
|
||||
- **Reduced code duplication** via shared utility functions
|
||||
- **Improved linter compatibility** by separating template and JavaScript concerns
|
||||
- **Better testability** with isolated, unit-testable modules
|
||||
- **Cleaner templates** with minimal, configuration-only JavaScript
|
||||
- **Easier debugging** with clearly separated concerns and proper error handling
|
||||
|
||||
All existing functionality has been preserved while significantly improving the codebase architecture and developer experience.
|
@ -38,12 +38,12 @@ class ScraperController {
|
||||
this.resetButton.addEventListener("click", () => this.resetScraper());
|
||||
}
|
||||
|
||||
// Volume form
|
||||
const volumeForm = document.getElementById("volumeForm");
|
||||
if (volumeForm) {
|
||||
volumeForm.addEventListener("submit", (e) => {
|
||||
// Configuration form (handles both volume and scraper module)
|
||||
const configForm = document.getElementById("volumeForm");
|
||||
if (configForm) {
|
||||
configForm.addEventListener("submit", (e) => {
|
||||
e.preventDefault();
|
||||
this.updateVolume();
|
||||
this.updateConfiguration();
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -245,25 +245,46 @@ class ScraperController {
|
||||
}
|
||||
|
||||
/**
|
||||
* Update volume configuration
|
||||
* Update configuration (volume and/or scraper module)
|
||||
*/
|
||||
async updateVolume() {
|
||||
async updateConfiguration() {
|
||||
const volumeInput = document.getElementById("volumeInput");
|
||||
const scraperSelect = document.getElementById("mainScraperSelect");
|
||||
const submitButton = document.querySelector(
|
||||
'#volumeForm button[type="submit"]'
|
||||
);
|
||||
|
||||
if (!volumeInput || !submitButton) return;
|
||||
if (!submitButton) return;
|
||||
|
||||
const volume = volumeInput.value;
|
||||
const updates = {};
|
||||
let hasChanges = false;
|
||||
|
||||
// Basic validation
|
||||
if (!volume || volume < 1 || volume > this.maxVolume) {
|
||||
showFlashMessage(
|
||||
`Please enter a valid volume between 1 and ${this.maxVolume}`,
|
||||
"warning"
|
||||
);
|
||||
volumeInput.focus();
|
||||
// Check volume changes
|
||||
if (volumeInput) {
|
||||
const volume = volumeInput.value;
|
||||
|
||||
// Basic validation
|
||||
if (!volume || volume < 1 || volume > this.maxVolume) {
|
||||
showFlashMessage(
|
||||
`Please enter a valid volume between 1 and ${this.maxVolume}`,
|
||||
"warning"
|
||||
);
|
||||
volumeInput.focus();
|
||||
return;
|
||||
}
|
||||
|
||||
updates.volume = volume;
|
||||
hasChanges = true;
|
||||
}
|
||||
|
||||
// Check scraper module changes
|
||||
if (scraperSelect && scraperSelect.value) {
|
||||
updates.scraper_module = scraperSelect.value;
|
||||
hasChanges = true;
|
||||
}
|
||||
|
||||
if (!hasChanges) {
|
||||
showFlashMessage("No changes to save", "info");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -273,21 +294,24 @@ class ScraperController {
|
||||
try {
|
||||
const data = await apiRequest("/scraper/update_config", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({ volume: volume }),
|
||||
body: JSON.stringify(updates),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage(
|
||||
data.message || "Volume updated successfully",
|
||||
data.message || "Configuration updated successfully",
|
||||
"success"
|
||||
);
|
||||
} else {
|
||||
showFlashMessage(data.message || "Failed to update volume", "error");
|
||||
showFlashMessage(
|
||||
data.message || "Failed to update configuration",
|
||||
"error"
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error updating volume:", error);
|
||||
console.error("Error updating configuration:", error);
|
||||
showFlashMessage(
|
||||
"Network error while updating volume. Please try again.",
|
||||
"Network error while updating configuration. Please try again.",
|
||||
"error"
|
||||
);
|
||||
} finally {
|
||||
|
500
scipaperloader/static/js/scraper-overview.js
Normal file
@ -0,0 +1,500 @@
|
||||
/**
|
||||
* Scraper Overview functionality
|
||||
*/
|
||||
|
||||
class ScraperOverview {
|
||||
constructor() {
|
||||
this.modal = null;
|
||||
this.scrapers = [];
|
||||
this.systemConfig = {};
|
||||
this.init();
|
||||
}
|
||||
|
||||
init() {
|
||||
// Initialize modal reference
|
||||
this.modal = document.getElementById("scraperOverviewModal");
|
||||
|
||||
// Load data when modal is shown
|
||||
if (this.modal) {
|
||||
this.modal.addEventListener("show.bs.modal", () => {
|
||||
this.loadScraperOverview();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async loadScraperOverview() {
|
||||
const loadingEl = document.getElementById("scraperOverviewLoading");
|
||||
const errorEl = document.getElementById("scraperOverviewError");
|
||||
const contentEl = document.getElementById("scraperOverviewContent");
|
||||
|
||||
// Show loading state
|
||||
loadingEl?.classList.remove("d-none");
|
||||
errorEl?.classList.add("d-none");
|
||||
contentEl?.classList.add("d-none");
|
||||
|
||||
try {
|
||||
// Load scrapers, system config, and publishers in parallel
|
||||
const [scrapersResponse, statusResponse, publishersResponse] =
|
||||
await Promise.all([
|
||||
fetch("/scraper/scrapers"),
|
||||
fetch("/scraper/status"),
|
||||
fetch("/scraper/publishers"),
|
||||
]);
|
||||
|
||||
if (
|
||||
!scrapersResponse.ok ||
|
||||
!statusResponse.ok ||
|
||||
!publishersResponse.ok
|
||||
) {
|
||||
throw new Error("Failed to load scraper information");
|
||||
}
|
||||
|
||||
const scrapersData = await scrapersResponse.json();
|
||||
const statusData = await statusResponse.json();
|
||||
const publishersData = await publishersResponse.json();
|
||||
|
||||
if (
|
||||
!scrapersData.success ||
|
||||
!statusData.success ||
|
||||
!publishersData.success
|
||||
) {
|
||||
throw new Error(
|
||||
scrapersData.message ||
|
||||
statusData.message ||
|
||||
publishersData.message ||
|
||||
"Unknown error"
|
||||
);
|
||||
}
|
||||
|
||||
this.scrapers = scrapersData.scrapers;
|
||||
this.systemConfig = statusData;
|
||||
this.publishersData = publishersData.data;
|
||||
|
||||
// Update UI
|
||||
this.updateSystemConfig();
|
||||
this.updateScrapersTable();
|
||||
this.updatePublishersSection();
|
||||
this.updateStatusFlowDiagram();
|
||||
|
||||
// Show content
|
||||
loadingEl?.classList.add("d-none");
|
||||
contentEl?.classList.remove("d-none");
|
||||
} catch (error) {
|
||||
console.error("Error loading scraper overview:", error);
|
||||
|
||||
// Show error state
|
||||
loadingEl?.classList.add("d-none");
|
||||
const errorMessage = document.getElementById(
|
||||
"scraperOverviewErrorMessage"
|
||||
);
|
||||
if (errorMessage) {
|
||||
errorMessage.textContent =
|
||||
error.message || "Failed to load scraper information";
|
||||
}
|
||||
errorEl?.classList.remove("d-none");
|
||||
}
|
||||
}
|
||||
|
||||
updateSystemConfig() {
|
||||
// Current scraper module
|
||||
const currentModuleEl = document.getElementById("currentScraperModule");
|
||||
if (currentModuleEl) {
|
||||
const currentModule =
|
||||
this.systemConfig.current_scraper_module || "System Default";
|
||||
currentModuleEl.textContent = currentModule;
|
||||
currentModuleEl.className = "badge bg-primary";
|
||||
}
|
||||
|
||||
// Volume limit
|
||||
const volumeLimitEl = document.getElementById("currentVolumeLimit");
|
||||
if (volumeLimitEl) {
|
||||
const volumeLimit = this.systemConfig.volume_config || "Unknown";
|
||||
volumeLimitEl.textContent = volumeLimit;
|
||||
}
|
||||
|
||||
// Total modules
|
||||
const totalModulesEl = document.getElementById("totalScraperModules");
|
||||
if (totalModulesEl) {
|
||||
totalModulesEl.textContent = this.scrapers.length;
|
||||
}
|
||||
|
||||
// Paper counts summary
|
||||
const paperCountsEl = document.getElementById("paperCountsSummary");
|
||||
if (paperCountsEl && this.systemConfig.paper_counts) {
|
||||
const counts = this.systemConfig.paper_counts;
|
||||
paperCountsEl.innerHTML = `
|
||||
<div class="d-flex flex-wrap gap-2">
|
||||
<span class="badge bg-primary">${counts.new || 0} New</span>
|
||||
<span class="badge bg-warning">${
|
||||
counts.processing || 0
|
||||
} Processing</span>
|
||||
<span class="badge bg-success">${
|
||||
counts.done || 0
|
||||
} Done</span>
|
||||
<span class="badge bg-danger">${
|
||||
counts.failed || 0
|
||||
} Failed</span>
|
||||
<span class="badge bg-info">${
|
||||
counts.pending || 0
|
||||
} Pending</span>
|
||||
<span class="badge bg-secondary">${
|
||||
counts.retrying || 0
|
||||
} Retrying</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
}
|
||||
|
||||
updateScrapersTable() {
|
||||
const tbody = document.getElementById("scrapersTableBody");
|
||||
if (!tbody) return;
|
||||
|
||||
tbody.innerHTML = "";
|
||||
|
||||
this.scrapers.forEach((scraper) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Check if this is the current active scraper
|
||||
const isCurrentScraper =
|
||||
scraper.name === this.systemConfig.current_scraper_module;
|
||||
|
||||
if (scraper.error) {
|
||||
row.innerHTML = `
|
||||
<td>${scraper.name}</td>
|
||||
<td colspan="5" class="text-danger">
|
||||
<i class="fas fa-exclamation-triangle"></i> ${scraper.error}
|
||||
</td>
|
||||
`;
|
||||
} else {
|
||||
row.innerHTML = `
|
||||
<td>
|
||||
<strong>${scraper.name}</strong>
|
||||
${
|
||||
scraper.name === "dummy"
|
||||
? '<span class="badge bg-info ms-2">Test Module</span>'
|
||||
: ""
|
||||
}
|
||||
${
|
||||
isCurrentScraper
|
||||
? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
|
||||
: ""
|
||||
}
|
||||
</td>
|
||||
<td class="scraper-description">
|
||||
${this.truncateDescription(scraper.description)}
|
||||
</td>
|
||||
<td class="input-status-list">
|
||||
${this.renderStatusBadges(
|
||||
scraper.input_statuses,
|
||||
"bg-info"
|
||||
)}
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-success">${
|
||||
scraper.output_status_success
|
||||
}</span>
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-danger">${
|
||||
scraper.output_status_failure
|
||||
}</span>
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-warning">${
|
||||
scraper.output_status_processing
|
||||
}</span>
|
||||
</td>
|
||||
`;
|
||||
}
|
||||
|
||||
// Highlight the current scraper row
|
||||
if (isCurrentScraper) {
|
||||
row.classList.add("table-success");
|
||||
}
|
||||
|
||||
tbody.appendChild(row);
|
||||
});
|
||||
}
|
||||
|
||||
updateStatusFlowDiagram() {
|
||||
const diagramEl = document.getElementById("statusFlowDiagram");
|
||||
if (!diagramEl) return;
|
||||
|
||||
// Analyze actual scrapers to build real flow
|
||||
const statusFlow = this.analyzeScraperFlow();
|
||||
|
||||
let diagramHTML = '<div class="status-flow-container">';
|
||||
|
||||
// Create visual flow based on actual scrapers
|
||||
statusFlow.forEach((stage, index) => {
|
||||
if (index > 0) {
|
||||
diagramHTML +=
|
||||
'<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>';
|
||||
}
|
||||
|
||||
diagramHTML += '<div class="status-flow-stage mb-4 p-3 border rounded">';
|
||||
diagramHTML += `<div class="fw-bold mb-2 text-primary">${stage.title}</div>`;
|
||||
|
||||
if (stage.scrapers && stage.scrapers.length > 0) {
|
||||
diagramHTML +=
|
||||
'<div class="mb-2"><small class="text-muted">Handled by: ' +
|
||||
stage.scrapers.map((s) => `<strong>${s}</strong>`).join(", ") +
|
||||
"</small></div>";
|
||||
}
|
||||
|
||||
diagramHTML += '<div class="status-badges">';
|
||||
stage.statuses.forEach((status, statusIndex) => {
|
||||
if (statusIndex > 0) {
|
||||
diagramHTML += '<i class="fas fa-arrow-right status-flow-arrow"></i>';
|
||||
}
|
||||
|
||||
const badgeClass = this.getStatusBadgeClass(status);
|
||||
diagramHTML += `<span class="status-flow-node badge ${badgeClass}">${status}</span>`;
|
||||
});
|
||||
diagramHTML += "</div>";
|
||||
|
||||
if (stage.description) {
|
||||
diagramHTML += `<div class="small text-muted mt-2">${stage.description}</div>`;
|
||||
}
|
||||
|
||||
diagramHTML += "</div>";
|
||||
});
|
||||
|
||||
diagramHTML += "</div>";
|
||||
|
||||
// Add explanation
|
||||
diagramHTML += `
|
||||
<div class="mt-4 p-3 bg-light rounded">
|
||||
<h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
|
||||
<ul class="small mb-0">
|
||||
<li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
|
||||
<li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
|
||||
<li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
|
||||
<li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
|
||||
<li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
|
||||
</ul>
|
||||
</div>
|
||||
`;
|
||||
|
||||
diagramEl.innerHTML = diagramHTML;
|
||||
}
|
||||
|
||||
analyzeScraperFlow() {
|
||||
// Build actual flow based on available scrapers
|
||||
const stages = [];
|
||||
const allInputStatuses = new Set();
|
||||
const allOutputStatuses = new Set();
|
||||
const scrapersByInput = {};
|
||||
|
||||
// Analyze scrapers to understand the flow
|
||||
this.scrapers.forEach((scraper) => {
|
||||
if (scraper.input_statuses) {
|
||||
scraper.input_statuses.forEach((status) => {
|
||||
allInputStatuses.add(status);
|
||||
if (!scrapersByInput[status]) {
|
||||
scrapersByInput[status] = [];
|
||||
}
|
||||
scrapersByInput[status].push(scraper.name);
|
||||
});
|
||||
}
|
||||
|
||||
if (scraper.output_status_success)
|
||||
allOutputStatuses.add(scraper.output_status_success);
|
||||
if (scraper.output_status_failure)
|
||||
allOutputStatuses.add(scraper.output_status_failure);
|
||||
});
|
||||
|
||||
// Entry point
|
||||
if (allInputStatuses.has("New")) {
|
||||
stages.push({
|
||||
title: "Entry Point",
|
||||
statuses: ["New"],
|
||||
scrapers: scrapersByInput["New"] || [],
|
||||
description: "Newly uploaded papers enter the processing pipeline",
|
||||
});
|
||||
}
|
||||
|
||||
// Processing stages
|
||||
const processingStatuses = Array.from(allInputStatuses).filter(
|
||||
(status) => !["New", "Done", "Failed"].includes(status)
|
||||
);
|
||||
|
||||
if (processingStatuses.length > 0) {
|
||||
stages.push({
|
||||
title: "Processing Stages",
|
||||
statuses: processingStatuses,
|
||||
scrapers: [],
|
||||
description: "Papers move through various processing stages",
|
||||
});
|
||||
}
|
||||
|
||||
// Final outputs
|
||||
const finalStatuses = ["Done", "Failed"];
|
||||
stages.push({
|
||||
title: "Final States",
|
||||
statuses: finalStatuses.filter((status) => allOutputStatuses.has(status)),
|
||||
scrapers: [],
|
||||
description: "Papers end up in final success or failure states",
|
||||
});
|
||||
|
||||
// Retry handling
|
||||
if (allInputStatuses.has("Failed")) {
|
||||
stages.push({
|
||||
title: "Retry Processing",
|
||||
statuses: ["Failed", "Retrying"],
|
||||
scrapers: scrapersByInput["Failed"] || [],
|
||||
description: "Failed papers can be retried with specialized scrapers",
|
||||
});
|
||||
}
|
||||
|
||||
return stages;
|
||||
}
|
||||
|
||||
getStatusBadgeClass(status) {
|
||||
const statusClasses = {
|
||||
New: "bg-primary",
|
||||
Pending: "bg-warning",
|
||||
Processing: "bg-warning",
|
||||
Retrying: "bg-warning",
|
||||
Done: "bg-success",
|
||||
Failed: "bg-danger",
|
||||
HtmlDownloaded: "bg-info",
|
||||
PublisherDetected: "bg-info",
|
||||
TextExtracted: "bg-info",
|
||||
};
|
||||
|
||||
return statusClasses[status] || "bg-secondary";
|
||||
}
|
||||
|
||||
renderStatusBadges(statuses, defaultClass = "bg-secondary") {
|
||||
if (!Array.isArray(statuses)) return "";
|
||||
|
||||
return statuses
|
||||
.map(
|
||||
(status) =>
|
||||
`<span class="badge ${this.getStatusBadgeClass(
|
||||
status
|
||||
)} status-badge">${status}</span>`
|
||||
)
|
||||
.join("");
|
||||
}
|
||||
|
||||
truncateDescription(description, maxLength = 100) {
|
||||
if (!description) return "No description available";
|
||||
|
||||
if (description.length <= maxLength) return description;
|
||||
|
||||
return description.substring(0, maxLength).trim() + "...";
|
||||
}
|
||||
|
||||
updatePublishersSection() {
|
||||
// Update publisher statistics
|
||||
const publisherStatsEl = document.getElementById("publisherStats");
|
||||
if (publisherStatsEl && this.publishersData && this.publishersData.stats) {
|
||||
const stats = this.publishersData.stats;
|
||||
publisherStatsEl.innerHTML = `
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-primary mb-1">${stats.total_publishers}</div>
|
||||
<div class="text-muted small">Total Publishers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-success mb-1">${stats.publishers_with_parsers}</div>
|
||||
<div class="text-muted small">With Parsers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-warning mb-1">${stats.publishers_without_parsers}</div>
|
||||
<div class="text-muted small">Missing Parsers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-info mb-1">${stats.total_papers_with_publisher}</div>
|
||||
<div class="text-muted small">Papers with Publisher</div>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
// Update publishers table
|
||||
const publishersTableBody = document.getElementById("publishersTableBody");
|
||||
if (
|
||||
publishersTableBody &&
|
||||
this.publishersData &&
|
||||
this.publishersData.publishers
|
||||
) {
|
||||
publishersTableBody.innerHTML = "";
|
||||
|
||||
if (this.publishersData.publishers.length === 0) {
|
||||
publishersTableBody.innerHTML = `
|
||||
<tr>
|
||||
<td colspan="4" class="text-center text-muted py-4">
|
||||
<i class="fas fa-info-circle"></i> No publishers detected yet.<br>
|
||||
<small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
|
||||
</td>
|
||||
</tr>
|
||||
`;
|
||||
return;
|
||||
}
|
||||
|
||||
this.publishersData.publishers.forEach((publisher) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Publisher status badge
|
||||
const statusBadge = publisher.has_parser
|
||||
? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
|
||||
: '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';
|
||||
|
||||
// Parser availability indicator
|
||||
const parserIndicator = publisher.has_parser
|
||||
? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
|
||||
: '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';
|
||||
|
||||
row.innerHTML = `
|
||||
<td>
|
||||
<strong>${publisher.name}</strong>
|
||||
</td>
|
||||
<td>
|
||||
<span class="badge bg-info">${publisher.paper_count}</span>
|
||||
</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td class="text-center">${parserIndicator}</td>
|
||||
`;
|
||||
|
||||
publishersTableBody.appendChild(row);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Public method to show the modal
|
||||
show() {
|
||||
if (this.modal) {
|
||||
const bootstrapModal = new bootstrap.Modal(this.modal);
|
||||
bootstrapModal.show();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Global function to load scraper overview (used by retry button)
|
||||
function loadScraperOverview() {
|
||||
if (window.scraperOverview) {
|
||||
window.scraperOverview.loadScraperOverview();
|
||||
}
|
||||
}
|
||||
|
||||
// Global function to show scraper overview modal
|
||||
function showScraperOverview() {
|
||||
if (!window.scraperOverview) {
|
||||
window.scraperOverview = new ScraperOverview();
|
||||
}
|
||||
window.scraperOverview.show();
|
||||
}
|
||||
|
||||
// Initialize when DOM is ready
|
||||
document.addEventListener("DOMContentLoaded", function () {
|
||||
window.scraperOverview = new ScraperOverview();
|
||||
});
|
@ -65,7 +65,13 @@
|
||||
<div class="col-md-6">
|
||||
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
|
||||
<div class="form-section">
|
||||
<h6>Scraper Module</h6>
|
||||
<div class="d-flex justify-content-between align-items-center mb-2">
|
||||
<h6>Scraper Module</h6>
|
||||
<button type="button" class="btn btn-outline-info btn-sm"
|
||||
onclick="showScraperOverview()" title="View scraper modules overview">
|
||||
<i class="fas fa-info-circle"></i> How Scrapers Work
|
||||
</button>
|
||||
</div>
|
||||
<p class="text-muted">Select which scraper module to use for processing papers.</p>
|
||||
|
||||
<div class="mb-3">
|
||||
|
@ -53,4 +53,13 @@
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock content %}
|
||||
|
||||
<!-- Include the scraper overview modal -->
|
||||
{% include "partials/scraper_overview_modal.html.jinja" %}
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
|
||||
{% endblock scripts %}
|
@ -0,0 +1,249 @@
|
||||
<!-- Scraper Overview Modal -->
|
||||
<div class="modal fade" id="scraperOverviewModal" tabindex="-1" role="dialog"
|
||||
aria-labelledby="scraperOverviewModalLabel" aria-hidden="true">
|
||||
<div class="modal-dialog modal-xl" role="document">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title" id="scraperOverviewModalLabel">
|
||||
<i class="fas fa-cogs"></i> Scraper Modules Overview
|
||||
</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<!-- Loading state -->
|
||||
<div id="scraperOverviewLoading" class="text-center py-4">
|
||||
<div class="spinner-border text-primary" role="status">
|
||||
<span class="visually-hidden">Loading...</span>
|
||||
</div>
|
||||
<p class="mt-2 text-muted">Loading scraper information...</p>
|
||||
</div>
|
||||
|
||||
<!-- Error state -->
|
||||
<div id="scraperOverviewError" class="alert alert-danger d-none" role="alert">
|
||||
<h6 class="alert-heading">Error Loading Scrapers</h6>
|
||||
<p id="scraperOverviewErrorMessage"></p>
|
||||
<button class="btn btn-outline-danger btn-sm" onclick="loadScraperOverview()">
|
||||
<i class="fas fa-redo"></i> Retry
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Content -->
|
||||
<div id="scraperOverviewContent" class="d-none">
|
||||
<!-- Scraper Architecture Overview -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-info-circle"></i> How Scraper Modules Work
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p class="mb-3">
|
||||
SciPaperLoader uses a modular scraper architecture where each scraper module handles
|
||||
specific paper processing stages. Papers flow through different statuses as they are
|
||||
processed by various scrapers.
|
||||
</p>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<h6>Key Concepts:</h6>
|
||||
<ul class="small">
|
||||
<li><strong>Input Statuses:</strong> Paper statuses this scraper can process
|
||||
</li>
|
||||
<li><strong>Output Statuses:</strong> Statuses papers get after processing</li>
|
||||
<li><strong>Processing Status:</strong> Temporary status while scraper works
|
||||
</li>
|
||||
<li><strong>Pipeline:</strong> Scrapers can be chained together</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<h6>Status Flow Example:</h6>
|
||||
<div class="d-flex align-items-center small">
|
||||
<span class="badge bg-info">New</span>
|
||||
<i class="fas fa-arrow-right mx-2"></i>
|
||||
<span class="badge bg-warning">Processing</span>
|
||||
<i class="fas fa-arrow-right mx-2"></i>
|
||||
<span class="badge bg-success">Done</span>
|
||||
</div>
|
||||
<div class="text-muted mt-1">Papers transition through these statuses</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Current System Configuration -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-server"></i> System Configuration
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row">
|
||||
<div class="col-md-4">
|
||||
<p><strong>Active Scraper Module:</strong> <span id="currentScraperModule"
|
||||
class="badge bg-primary">Loading...</span></p>
|
||||
<p><strong>Daily Volume Limit:</strong> <span
|
||||
id="currentVolumeLimit">Loading...</span> papers</p>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p><strong>Total Available Modules:</strong> <span
|
||||
id="totalScraperModules">Loading...</span></p>
|
||||
<p><strong>Processing Pipeline:</strong> <span
|
||||
id="processingPipeline">Multi-stage</span></p>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p><strong>Current Paper Counts:</strong></p>
|
||||
<div id="paperCountsSummary" class="small">
|
||||
<!-- Will be populated by JavaScript -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Available Scrapers Table -->
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-list"></i> Available Scraper Modules
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Module Name</th>
|
||||
<th>Description</th>
|
||||
<th>Input Statuses</th>
|
||||
<th>Success Output</th>
|
||||
<th>Failure Output</th>
|
||||
<th>Processing Status</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="scrapersTableBody">
|
||||
<!-- Table content will be populated by JavaScript -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Publisher Parser Overview -->
|
||||
<div class="card mt-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-building"></i> Publisher Parser Overview
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row mb-3">
|
||||
<div class="col-md-12">
|
||||
<p class="text-muted mb-2">
|
||||
<i class="fas fa-info-circle"></i>
|
||||
Publishers are detected from paper URLs and mapped to specific parser modules
|
||||
for content extraction.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Publisher Statistics -->
|
||||
<div class="row mb-4" id="publisherStats">
|
||||
<!-- Will be populated by JavaScript -->
|
||||
</div>
|
||||
|
||||
<!-- Publishers Table -->
|
||||
<div class="table-responsive">
|
||||
<table class="table table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Publisher</th>
|
||||
<th>Papers</th>
|
||||
<th>Parser Status</th>
|
||||
<th>Parser Available</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="publishersTableBody">
|
||||
<!-- Table content will be populated by JavaScript -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Status Flow Diagram -->
|
||||
<div class="card mt-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-project-diagram"></i> Paper Status Flow Diagram
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div id="statusFlowDiagram" class="text-center py-4">
|
||||
<!-- This will be populated by JavaScript -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<div class="d-flex justify-content-between w-100">
|
||||
<small class="text-muted">
|
||||
<i class="fas fa-lightbulb"></i>
|
||||
Tip: Scrapers can be chained to create complex processing pipelines
|
||||
</small>
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
/* Custom styles for the scraper overview modal */
|
||||
#scraperOverviewModal .modal-xl {
|
||||
max-width: 1200px;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .table th {
|
||||
font-size: 0.9rem;
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .badge {
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .status-badge {
|
||||
margin: 2px;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
.status-flow-node {
|
||||
display: inline-block;
|
||||
padding: 8px 16px;
|
||||
margin: 4px;
|
||||
border-radius: 20px;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.status-flow-arrow {
|
||||
color: #6c757d;
|
||||
margin: 0 8px;
|
||||
}
|
||||
|
||||
.scraper-description {
|
||||
max-width: 300px;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.input-status-list {
|
||||
max-width: 150px;
|
||||
}
|
||||
|
||||
.status-output {
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
</style>
|
@ -114,20 +114,44 @@
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Volume Configuration</h5>
|
||||
<div class="card-header d-flex justify-content-between align-items-center">
|
||||
<h5>Scraper Configuration</h5>
|
||||
<button type="button" class="btn btn-outline-info btn-sm" onclick="showScraperOverview()"
|
||||
title="View scraper modules overview">
|
||||
<i class="fas fa-info-circle"></i> How Scrapers Work
|
||||
</button>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<form id="volumeForm">
|
||||
<div class="form-group">
|
||||
<div class="form-group mb-3">
|
||||
<label for="volumeInput">Papers per day:</label>
|
||||
<input type="number" class="form-control" id="volumeInput"
|
||||
value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
|
||||
<button type="submit" class="btn btn-primary mt-2">
|
||||
<i class="fas fa-save"></i> Update Volume
|
||||
</button>
|
||||
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
|
||||
</div>
|
||||
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
|
||||
|
||||
<div class="form-group mb-3">
|
||||
<label for="mainScraperSelect">Scraper Module:</label>
|
||||
<select class="form-control" id="mainScraperSelect">
|
||||
{% for module in available_scraper_modules %}
|
||||
<option value="{{ module }}" {% if module==current_scraper_module %}selected{% endif %}>
|
||||
{{ module }}
|
||||
{% if scraper_details[module] %}
|
||||
- {{ scraper_details[module].description[:50] }}{% if
|
||||
scraper_details[module].description|length > 50 %}...{% endif %}
|
||||
{% endif %}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<div class="form-text">
|
||||
Select which scraper module to use for automated processing. Current: <strong>{{
|
||||
current_scraper_module }}</strong>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<button type="submit" class="btn btn-primary">
|
||||
<i class="fas fa-save"></i> Update Configuration
|
||||
</button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
@ -306,6 +330,10 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Include the scraper overview modal -->
|
||||
{% include "partials/scraper_overview_modal.html.jinja" %}
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
@ -320,6 +348,7 @@
|
||||
<script src="{{ url_for('static', filename='js/paper-processor.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/activity-monitor.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-dashboard.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
|
||||
|
||||
<script id="scraper-config" type="application/json">
|
||||
{
|
||||
|