diff --git a/Makefile b/Makefile index 22d24dc..43533a9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # List of phony targets (targets that don't represent files) -.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics +.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics clean-papers purge-db # Define Python and pip executables inside virtual environment PYTHON := venv/bin/python @@ -14,7 +14,7 @@ clean: rm -rf venv build dist .pytest_cache .mypy_cache *.egg-info # Define database path -DB_PATH=scipaperloader/papers.db +DB_PATH=instance/papers.db # Backup the database with timestamp backup-db: @@ -90,6 +90,24 @@ reset-db: venv $(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration" $(PYTHON) -m flask --app scipaperloader db upgrade +# Clean all papers from the database (keep other tables intact) +clean-papers: venv + @echo "Cleaning all papers from the database..." + @$(PYTHON) -c "from scipaperloader.db import db; from scipaperloader.models import PaperMetadata; from scipaperloader import create_app; app = create_app(); app.app_context().push(); PaperMetadata.query.delete(); db.session.commit(); print('All papers have been removed from the database')" + +# Completely purge all database contents (removes all tables and data) +purge-db: venv + @echo "WARNING: This will completely wipe all database contents!" + @read -p "Are you sure you want to continue? (y/N) " -n 1 -r; \ + echo; \ + if [[ $$REPLY =~ ^[Yy]$$ ]]; then \ + echo "Purging database..."; \ + rm -f $(DB_PATH); \ + echo "Database completely purged"; \ + else \ + echo "Operation cancelled"; \ + fi + # Create and set up virtual environment venv: python3 -m venv venv && \ diff --git a/pyproject.toml b/pyproject.toml index ec81343..6a4a051 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ dependencies = [ "pandas>=2.2.3,<3", "APScheduler>=3.10.4,<4", "flask-migrate>=4.1.0,<5", + "beautifulsoup4>=4.13.4,<5 ", + "requests>=2.32.4,<3" ] [project.optional-dependencies] diff --git a/scipaperloader/blueprints/scraper.py b/scipaperloader/blueprints/scraper.py index 2fae038..95d5947 100644 --- a/scipaperloader/blueprints/scraper.py +++ b/scipaperloader/blueprints/scraper.py @@ -29,6 +29,10 @@ def index(): # Get volume configuration volume_config = VolumeConfig.get_current_volume() + # Get scraper module configuration + from ..models import ScraperModuleConfig + current_scraper_module = ScraperModuleConfig.get_current_module() + # Get paper counts by status paper_counts = { 'new': PaperMetadata.query.filter_by(status='New').count(), @@ -46,7 +50,10 @@ def index(): recent_logs=recent_logs, paper_counts=paper_counts, volume_config=volume_config, - max_volume=MAX_VOLUME + max_volume=MAX_VOLUME, + current_scraper_module=current_scraper_module, + available_scraper_modules=[s["name"] for s in available_scrapers], + scraper_details={s["name"]: s for s in available_scrapers} ) @bp.route("/start", methods=["POST"]) @@ -219,6 +226,13 @@ def get_status(): # Get current hour quota info current_quota = scraper_manager.get_current_hour_quota() + # Get current scraper module configuration + from ..models import ScraperModuleConfig + current_scraper_module = ScraperModuleConfig.get_current_module() + + # Get volume configuration + current_volume = VolumeConfig.get_current_volume() + return jsonify({ "success": True, "scraper_state": { @@ -227,7 +241,9 @@ def get_status(): "last_updated": 
scraper_state.last_updated.isoformat() if scraper_state.last_updated else None }, "paper_counts": paper_counts, - "current_quota": current_quota + "current_quota": current_quota, + "current_scraper_module": current_scraper_module, + "volume_config": current_volume }) except Exception as e: @@ -665,6 +681,35 @@ def update_scraper_config(): "message": message }), 400 + # Handle scraper module configuration updates + if "scraper_module" in data: + from ..models import ScraperModuleConfig + + new_module = data["scraper_module"] + + # Validate that the module exists and is valid + available_modules = [m["name"] for m in get_available_scrapers()] + + if new_module not in available_modules: + return jsonify({ + "success": False, + "message": f"Invalid scraper module: {new_module}" + }), 400 + + # Update the database configuration + ScraperModuleConfig.set_module(new_module) + + ActivityLog.log_scraper_command( + action="update_scraper_module", + status="success", + description=f"Updated scraper module to '{new_module}'" + ) + + return jsonify({ + "success": True, + "message": f"Scraper module updated to '{new_module}' successfully" + }) + # Handle other configuration updates here if needed in the future return jsonify({ @@ -681,4 +726,73 @@ def update_scraper_config(): return jsonify({ "success": False, "message": f"Error updating scraper config: {str(e)}" + }), 500 + +@bp.route("/publishers") +def get_publishers(): + """Get publisher overview data for the scraper overview modal.""" + try: + import os + import glob + + # Get available parser modules + parsers_dir = os.path.join(current_app.root_path, 'parsers') + parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py')) + available_parsers = [] + + for parser_file in parser_files: + filename = os.path.basename(parser_file) + if filename != 'base_parser.py': # Skip the base parser + parser_name = filename.replace('_parser.py', '') + available_parsers.append(parser_name) + + # Get publishers from database (papers that have publisher detected) + publisher_query = db.session.query( + PaperMetadata.publisher, + db.func.count(PaperMetadata.id).label('paper_count') + ).filter( + PaperMetadata.publisher.isnot(None), + PaperMetadata.publisher != '' + ).group_by(PaperMetadata.publisher).all() + + publishers_data = [] + for publisher, count in publisher_query: + # Check if a parser exists for this publisher + has_parser = publisher in available_parsers + + publishers_data.append({ + 'name': publisher, + 'paper_count': count, + 'has_parser': has_parser, + 'parser_status': 'available' if has_parser else 'missing' + }) + + # Sort by paper count descending + publishers_data.sort(key=lambda x: x['paper_count'], reverse=True) + + # Get totals + total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data) + total_papers_without_publisher = PaperMetadata.query.filter( + db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '') + ).count() + + return jsonify({ + 'success': True, + 'data': { + 'publishers': publishers_data, + 'available_parsers': available_parsers, + 'stats': { + 'total_publishers': len(publishers_data), + 'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]), + 'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]), + 'total_papers_with_publisher': total_papers_with_publisher, + 'total_papers_without_publisher': total_papers_without_publisher + } + } + }) + + except Exception as e: + return jsonify({ + 'success': False, + 'message': f'Error getting 
publisher data: {str(e)}' }), 500 \ No newline at end of file diff --git a/scipaperloader/models.py b/scipaperloader/models.py index 8ddb780..3387bd5 100644 --- a/scipaperloader/models.py +++ b/scipaperloader/models.py @@ -191,6 +191,7 @@ class PaperMetadata(db.Model): type = db.Column(db.String(50)) language = db.Column(db.String(50)) published_online = db.Column(db.Date) # or DateTime/String + publisher = db.Column(db.String(100), nullable=True) # Detected publisher name status = db.Column(db.String(10)) # 'Pending','Done','Failed' previous_status = db.Column(db.String(10), nullable=True) # Store previous status for reversion file_path = db.Column(db.Text) diff --git a/scipaperloader/parsers/__init__.py b/scipaperloader/parsers/__init__.py new file mode 100644 index 0000000..c90adf2 --- /dev/null +++ b/scipaperloader/parsers/__init__.py @@ -0,0 +1,6 @@ +# Parser modules for extracting full text from publisher-specific HTML content +from .base_parser import BaseParser, ParsedContent, ParseError +from .elsevier_parser import ElsevierParser +from .arxiv_parser import ArxivParser + +__all__ = ['BaseParser', 'ParsedContent', 'ParseError', 'ElsevierParser', 'ArxivParser'] diff --git a/scipaperloader/parsers/arxiv_parser.py b/scipaperloader/parsers/arxiv_parser.py new file mode 100644 index 0000000..35b8bfe --- /dev/null +++ b/scipaperloader/parsers/arxiv_parser.py @@ -0,0 +1,227 @@ +import re +from bs4 import BeautifulSoup +from typing import Dict, Optional, List +from .base_parser import BaseParser, ParsedContent, ParseError + +class ArxivParser(BaseParser): + """Parser for arXiv papers.""" + + def can_parse(self, html_content: str, url: Optional[str] = None) -> bool: + """Check if this is an arXiv page.""" + html_lower = html_content.lower() + + # Check for arXiv indicators + indicators = [ + 'arxiv.org', + 'export.arxiv.org', + 'arxiv:', + 'meta name="citation_publisher" content="arxiv"', + ] + + return any(indicator in html_lower for indicator in indicators) + + def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent: + """Parse arXiv HTML content.""" + try: + soup = BeautifulSoup(html_content, 'html.parser') + + # Extract title + title = self._extract_title(soup) + + # Extract abstract + abstract = self._extract_abstract(soup) + + # Extract authors + authors = self._extract_authors(soup) + + # Extract full text (arXiv usually just has abstract on the HTML page) + full_text = self._extract_full_text(soup, abstract) + + # Extract keywords/subjects + keywords = self._extract_subjects(soup) + + # Extract arxiv ID + arxiv_id = self._extract_arxiv_id(soup) + + if not full_text or len(full_text.strip()) < 50: + raise ParseError("Could not extract meaningful content from arXiv page") + + return ParsedContent( + full_text=full_text, + title=title, + abstract=abstract, + authors=authors, + keywords=keywords, + sections=None, # arXiv HTML pages don't usually have full sections + references=None, # References are typically in the PDF + doi=doi, + journal="arXiv", + publication_date=self._extract_submission_date(soup), + metadata={ + 'parser': 'arxiv', + 'arxiv_id': arxiv_id, + 'source': 'arxiv.org' + } + ) + + except Exception as e: + raise ParseError(f"Failed to parse arXiv content: {str(e)}") + + def _extract_title(self, soup: BeautifulSoup) -> Optional[str]: + """Extract paper title.""" + # Try multiple title selectors for arXiv + selectors = [ + 'h1.title', + 'meta[name="citation_title"]', + 'title' + ] + + for selector in selectors: + if 'meta' in selector: + element = 
soup.find('meta', attrs={'name': 'citation_title'}) + if element: + return element.get('content', '').strip() + else: + element = soup.select_one(selector) + if element: + text = element.get_text(strip=True) + # Remove "Title:" prefix if present + text = re.sub(r'^Title:\s*', '', text) + return text + + return None + + def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]: + """Extract paper abstract.""" + # arXiv abstract selectors + selectors = [ + 'blockquote.abstract', + 'div.abstract', + 'meta[name="citation_abstract"]' + ] + + for selector in selectors: + if 'meta' in selector: + element = soup.find('meta', attrs={'name': 'citation_abstract'}) + if element: + return element.get('content', '').strip() + else: + element = soup.select_one(selector) + if element: + text = element.get_text(strip=True) + # Remove "Abstract:" prefix if present + text = re.sub(r'^Abstract:\s*', '', text) + return text + + return None + + def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]: + """Extract author names.""" + authors = [] + + # Try author meta tags + author_metas = soup.find_all('meta', attrs={'name': 'citation_author'}) + if author_metas: + authors = [meta.get('content', '').strip() for meta in author_metas] + + # Try arXiv author div + if not authors: + authors_div = soup.select_one('div.authors') + if authors_div: + # Extract author links or text + author_links = authors_div.find_all('a') + if author_links: + authors = [link.get_text(strip=True) for link in author_links] + else: + # Fallback to text parsing + text = authors_div.get_text() + # Remove "Authors:" prefix and split by commas + text = re.sub(r'^Authors?:\s*', '', text) + authors = [author.strip() for author in text.split(',')] + + return authors if authors else None + + def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str: + """Extract main content (usually just abstract for arXiv HTML pages).""" + content_parts = [] + + # For arXiv, the HTML page typically only contains abstract and metadata + # The full text is in the PDF + + if abstract: + content_parts.append(f"Abstract\n{abstract}") + + # Look for any additional content sections + comments_section = soup.select_one('td.comments') + if comments_section: + comments = comments_section.get_text(strip=True) + if comments: + content_parts.append(f"Comments\n{comments}") + + # Add note about PDF availability + content_parts.append( + "\nNote: This is the abstract and metadata from the arXiv HTML page. " + "The full text is available in the PDF version." 
+ ) + + return '\n\n'.join(content_parts) + + def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]: + """Extract subject classifications.""" + subjects = [] + + # Look for subject classification + subjects_td = soup.select_one('td.subjects') + if subjects_td: + subjects_text = subjects_td.get_text(strip=True) + # Parse subjects (format: "Primary: subject1; Secondary: subject2") + subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)] + # Clean up prefixes + subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects] + subjects = [subj for subj in subjects if subj] # Remove empty strings + + return subjects if subjects else None + + def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]: + """Extract arXiv ID.""" + # Look for arXiv ID in various places + arxiv_id_patterns = [ + r'arXiv:(\d+\.\d+(?:v\d+)?)', + r'(\d{4}\.\d{4,5}(?:v\d+)?)', + ] + + # Search in page text + page_text = soup.get_text() + for pattern in arxiv_id_patterns: + match = re.search(pattern, page_text) + if match: + return match.group(1) + + # Search in URL or meta tags + canonical_link = soup.find('link', attrs={'rel': 'canonical'}) + if canonical_link: + href = canonical_link.get('href', '') + for pattern in arxiv_id_patterns: + match = re.search(pattern, href) + if match: + return match.group(1) + + return None + + def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]: + """Extract submission date.""" + # Look for submission date + submission_td = soup.select_one('td.submission-history') + if submission_td: + date_text = submission_td.get_text() + # Extract date (format varies) + date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text) + if date_match: + return date_match.group(1) + + # Try meta tag + date_meta = soup.find('meta', attrs={'name': 'citation_date'}) + if date_meta: + return date_meta.get('content', '').strip() + + return None diff --git a/scipaperloader/parsers/base_parser.py b/scipaperloader/parsers/base_parser.py new file mode 100644 index 0000000..58d8bb1 --- /dev/null +++ b/scipaperloader/parsers/base_parser.py @@ -0,0 +1,83 @@ +from abc import ABC, abstractmethod +from typing import Dict, Optional, List +from dataclasses import dataclass + +@dataclass +class ParsedContent: + """Container for parsed content from a publisher's HTML.""" + full_text: str + title: Optional[str] = None + abstract: Optional[str] = None + authors: Optional[List[str]] = None + keywords: Optional[List[str]] = None + sections: Optional[Dict[str, str]] = None # section_title -> section_content + references: Optional[List[str]] = None + doi: Optional[str] = None + journal: Optional[str] = None + publication_date: Optional[str] = None + metadata: Optional[Dict] = None # Additional metadata specific to publisher + +class BaseParser(ABC): + """Base class for all publisher-specific parsers.""" + + def __init__(self): + self.parser_name = self.__class__.__name__.lower().replace('parser', '') + + @abstractmethod + def can_parse(self, html_content: str, url: Optional[str] = None) -> bool: + """ + Check if this parser can handle the given HTML content. + + Args: + html_content: The HTML content to check + url: Optional URL of the content (for additional context) + + Returns: + True if this parser can handle the content, False otherwise + """ + pass + + @abstractmethod + def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent: + """ + Parse HTML content and extract structured information. 
+ + Args: + html_content: The HTML content to parse + doi: Optional DOI of the paper + + Returns: + ParsedContent object with extracted information + + Raises: + ParseError: If parsing fails + """ + pass + + def get_name(self) -> str: + """Return the name of this parser.""" + return self.parser_name + + def get_description(self) -> str: + """Return a description of this parser.""" + return getattr(self.__class__, "__doc__", "No description available") + + def validate_content(self, content: ParsedContent) -> bool: + """ + Validate the parsed content to ensure it meets minimum requirements. + + Args: + content: The parsed content to validate + + Returns: + True if content is valid, False otherwise + """ + # Basic validation - must have some full text + if not content.full_text or len(content.full_text.strip()) < 100: + return False + + return True + +class ParseError(Exception): + """Exception raised when parsing fails.""" + pass diff --git a/scipaperloader/parsers/elsevier_parser.py b/scipaperloader/parsers/elsevier_parser.py new file mode 100644 index 0000000..35701e1 --- /dev/null +++ b/scipaperloader/parsers/elsevier_parser.py @@ -0,0 +1,252 @@ +import re +from bs4 import BeautifulSoup +from typing import Dict, Optional, List +from .base_parser import BaseParser, ParsedContent, ParseError + +class ElsevierParser(BaseParser): + """Parser for Elsevier/ScienceDirect articles.""" + + def can_parse(self, html_content: str, url: Optional[str] = None) -> bool: + """Check if this is an Elsevier/ScienceDirect page.""" + html_lower = html_content.lower() + + # Check for Elsevier/ScienceDirect indicators + indicators = [ + 'sciencedirect.com', + 'elsevier.com', + 'meta name="citation_publisher" content="elsevier"', + 'copyright.*elsevier', + 'sciencedirect', + ] + + return any(indicator in html_lower for indicator in indicators) + + def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent: + """Parse Elsevier/ScienceDirect HTML content.""" + try: + soup = BeautifulSoup(html_content, 'html.parser') + + # Extract title + title = self._extract_title(soup) + + # Extract abstract + abstract = self._extract_abstract(soup) + + # Extract authors + authors = self._extract_authors(soup) + + # Extract full text + full_text = self._extract_full_text(soup) + + # Extract sections + sections = self._extract_sections(soup) + + # Extract keywords + keywords = self._extract_keywords(soup) + + # Extract references + references = self._extract_references(soup) + + # Extract journal info + journal = self._extract_journal(soup) + + # Extract publication date + publication_date = self._extract_publication_date(soup) + + # Combine everything into full text if sections exist + if sections: + full_text = self._combine_sections(sections, abstract) + + if not full_text or len(full_text.strip()) < 100: + raise ParseError("Could not extract meaningful full text content") + + return ParsedContent( + full_text=full_text, + title=title, + abstract=abstract, + authors=authors, + keywords=keywords, + sections=sections, + references=references, + doi=doi, + journal=journal, + publication_date=publication_date, + metadata={ + 'parser': 'elsevier', + 'source': 'sciencedirect' + } + ) + + except Exception as e: + raise ParseError(f"Failed to parse Elsevier content: {str(e)}") + + def _extract_title(self, soup: BeautifulSoup) -> Optional[str]: + """Extract article title.""" + # Try multiple title selectors + selectors = [ + 'h1.title-text', + 'h1[data-testid="title"]', + 'h1.article-title', + 
'meta[name="citation_title"]', + 'title' + ] + + for selector in selectors: + if 'meta' in selector: + element = soup.find('meta', attrs={'name': 'citation_title'}) + if element: + return element.get('content', '').strip() + else: + element = soup.select_one(selector) + if element: + return element.get_text(strip=True) + + return None + + def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]: + """Extract article abstract.""" + selectors = [ + 'div.abstract-content', + 'div[data-testid="abstract"]', + 'div.abstract', + 'section.abstract', + 'div#abstract' + ] + + for selector in selectors: + element = soup.select_one(selector) + if element: + return element.get_text(strip=True) + + return None + + def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]: + """Extract author names.""" + authors = [] + + # Try author meta tags + author_metas = soup.find_all('meta', attrs={'name': 'citation_author'}) + if author_metas: + authors = [meta.get('content', '').strip() for meta in author_metas] + + # Try author div/span elements + if not authors: + author_elements = soup.select('div.author a, span.author, .author-name') + authors = [elem.get_text(strip=True) for elem in author_elements] + + return authors if authors else None + + def _extract_full_text(self, soup: BeautifulSoup) -> str: + """Extract main article content.""" + content_parts = [] + + # Try main content selectors + main_selectors = [ + 'div.article-content', + 'div.body-content', + 'main.article-body', + 'div[data-testid="article-body"]', + 'section.article-section' + ] + + for selector in main_selectors: + elements = soup.select(selector) + for element in elements: + # Remove script, style, and navigation elements + for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']): + unwanted.decompose() + + text = element.get_text(separator='\n', strip=True) + if text and len(text) > 50: # Only add substantial content + content_parts.append(text) + + return '\n\n'.join(content_parts) + + def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]: + """Extract article sections with headings.""" + sections = {} + + # Look for section headings and content + section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading')) + + for heading in section_elements: + section_title = heading.get_text(strip=True) + + # Find content after this heading until next heading + content_parts = [] + current = heading.next_sibling + + while current and current.name not in ['h1', 'h2', 'h3', 'h4']: + if hasattr(current, 'get_text'): + text = current.get_text(strip=True) + if text: + content_parts.append(text) + current = current.next_sibling + + if content_parts: + sections[section_title] = '\n'.join(content_parts) + + return sections if sections else None + + def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]: + """Extract article keywords.""" + keywords = [] + + # Try keyword meta tags + keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'}) + if keyword_metas: + for meta in keyword_metas: + content = meta.get('content', '') + if content: + keywords.extend([kw.strip() for kw in content.split(',')]) + + # Try keyword sections + if not keywords: + keyword_sections = soup.select('div.keywords, section.keywords') + for section in keyword_sections: + text = section.get_text() + keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()]) + + return keywords if keywords else None + + def _extract_references(self, soup: 
BeautifulSoup) -> Optional[List[str]]: + """Extract references.""" + references = [] + + ref_sections = soup.select('section.references, div.references, ol.references li') + for section in ref_sections: + if section.name == 'li': + references.append(section.get_text(strip=True)) + else: + ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference')) + references.extend([item.get_text(strip=True) for item in ref_items]) + + return references if references else None + + def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]: + """Extract journal name.""" + journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'}) + if journal_meta: + return journal_meta.get('content', '').strip() + + return None + + def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]: + """Extract publication date.""" + date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'}) + if date_meta: + return date_meta.get('content', '').strip() + + return None + + def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str: + """Combine all sections into full text.""" + full_text_parts = [] + + if abstract: + full_text_parts.append(f"Abstract\n{abstract}") + + for section_title, section_content in sections.items(): + full_text_parts.append(f"{section_title}\n{section_content}") + + return '\n\n'.join(full_text_parts) diff --git a/scipaperloader/scrapers/base.py b/scipaperloader/scrapers/base.py index 341be94..dd950ca 100644 --- a/scipaperloader/scrapers/base.py +++ b/scipaperloader/scrapers/base.py @@ -18,6 +18,43 @@ class BaseScraper(ABC): OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing + def __init__(self): + """Initialize the scraper.""" + self.scraper_name = self.get_name().lower() + + def log_scrape_start(self, doi: str, paper_id: Optional[int] = None): + """Log the start of a scraping operation.""" + from ..models import ActivityLog + + ActivityLog.log_scraper_activity( + action=f"{self.scraper_name}_scrape_start", + status="info", + description=f"Starting {self.get_name()} for DOI: {doi}", + paper_id=paper_id + ) + + def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None): + """Log successful completion of scraping.""" + from ..models import ActivityLog + + ActivityLog.log_scraper_activity( + action=f"{self.scraper_name}_scrape_success", + status="success", + description=f"{self.get_name()} completed successfully for DOI: {doi} - {message}", + paper_id=paper_id + ) + + def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None): + """Log failed scraping operation.""" + from ..models import ActivityLog + + ActivityLog.log_scraper_activity( + action=f"{self.scraper_name}_scrape_failure", + status="error", + description=f"{self.get_name()} failed for DOI: {doi} - {message}", + paper_id=paper_id + ) + @abstractmethod def scrape(self, doi: str) -> ScrapeResult: """ diff --git a/scipaperloader/scrapers/dummy.py b/scipaperloader/scrapers/dummy.py index f77f193..182ba00 100644 --- a/scipaperloader/scrapers/dummy.py +++ b/scipaperloader/scrapers/dummy.py @@ -30,6 +30,9 @@ class Scraper(BaseScraper): timestamp=datetime.utcnow() ) + # Log start of scraping + self.log_scrape_start(doi, paper.id) + # Simulate processing time (1-3 seconds) processing_time = random.uniform(1, 3) time.sleep(processing_time) @@ -145,12 +148,7 @@ class Scraper(BaseScraper): ) # Log success - 
ActivityLog.log_scraper_activity( - action="dummy_scrape", - status="success", - description=f"Successfully scraped {doi}", - paper_id=paper.id - ) + self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id) result = ScrapeResult( status="success", @@ -178,12 +176,7 @@ class Scraper(BaseScraper): paper.error_msg = error_msg # Log failure - ActivityLog.log_scraper_activity( - action="dummy_scrape", - status="error", - description=f"Failed to scrape {doi}: {error_msg}", - paper_id=paper.id - ) + self.log_scrape_failure(doi, error_msg, paper.id) result = ScrapeResult( status="error", diff --git a/scipaperloader/scrapers/failed_retry.py b/scipaperloader/scrapers/failed_retry.py index 916eb16..f338c33 100644 --- a/scipaperloader/scrapers/failed_retry.py +++ b/scipaperloader/scrapers/failed_retry.py @@ -30,13 +30,8 @@ class Scraper(BaseScraper): timestamp=datetime.utcnow() ) - # Log retry attempt - ActivityLog.log_scraper_activity( - action="retry_failed_paper", - status="info", - description=f"Retrying failed paper: {paper.title}", - paper_id=paper.id - ) + # Log start of retry + self.log_scrape_start(doi, paper.id) # Simulate longer processing time for retry (2-5 seconds) processing_time = random.uniform(2, 5) @@ -64,12 +59,7 @@ class Scraper(BaseScraper): result_data = {"file_path": file_path} # Log success - ActivityLog.log_scraper_activity( - action="retry_scrape_success", - status="success", - description=f"Successfully retried {doi} on second attempt", - paper_id=paper.id - ) + self.log_scrape_success(doi, f"Successfully retried {doi} on second attempt", paper.id) result = ScrapeResult( status="success", @@ -81,12 +71,7 @@ class Scraper(BaseScraper): except Exception as e: error_msg = f"Failed to save retry file: {str(e)}" - ActivityLog.log_scraper_activity( - action="retry_scrape_file_error", - status="error", - description=error_msg, - paper_id=paper.id - ) + self.log_scrape_failure(doi, error_msg, paper.id) result = ScrapeResult( status="error", @@ -105,12 +90,7 @@ class Scraper(BaseScraper): ] error_msg = random.choice(error_messages) - ActivityLog.log_scraper_activity( - action="retry_scrape_failure", - status="error", - description=f"Retry failed for {doi}: {error_msg}", - paper_id=paper.id - ) + self.log_scrape_failure(doi, error_msg, paper.id) result = ScrapeResult( status="error", diff --git a/scipaperloader/scrapers/html_fetcher.py b/scipaperloader/scrapers/html_fetcher.py new file mode 100644 index 0000000..af7931c --- /dev/null +++ b/scipaperloader/scrapers/html_fetcher.py @@ -0,0 +1,172 @@ +import time +import os +import requests +from datetime import datetime +from .base import BaseScraper, ScrapeResult +from flask import current_app +from ..models import PaperMetadata, ActivityLog, DownloadPathConfig +from ..db import db + +class Scraper(BaseScraper): + """Scraper that fetches HTML content from DOI and saves it for further processing.""" + + # This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed" + INPUT_STATUSES = ["New"] + OUTPUT_STATUS_SUCCESS = "HtmlDownloaded" + OUTPUT_STATUS_FAILURE = "Failed" + OUTPUT_STATUS_PROCESSING = "FetchingHtml" + + def scrape(self, doi: str) -> ScrapeResult: + """Fetch HTML content from DOI and save to download path.""" + start_time = time.time() + + paper = PaperMetadata.query.filter_by(doi=doi).first() + if not paper: + return ScrapeResult( + status="error", + message=f"No paper found for DOI {doi}", + data=None, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Log start of 
scraping + self.log_scrape_start(doi, paper.id) + + # Update status to processing + paper.status = self.OUTPUT_STATUS_PROCESSING + db.session.commit() + + # Prepare file paths + download_path = DownloadPathConfig.get_path() + file_name = f"{doi.replace('/', '_')}.html" + file_path = os.path.join(download_path, file_name) + + # Check/create download directory (same pattern as dummy) + if not os.path.exists(download_path): + try: + os.makedirs(download_path, exist_ok=True) + except OSError as e: + error_msg = f"Failed to create download directory: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "path_creation_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Check path permissions (same pattern as dummy) + if not os.access(download_path, os.W_OK): + error_msg = f"Download path '{download_path}' is not writable" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + ActivityLog.log_scraper_activity( + action="html_fetch_path_error", + status="error", + description=error_msg, + paper_id=paper.id + ) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "path_write_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + try: + # Fetch HTML from DOI + doi_url = f"https://doi.org/{doi}" + headers = {'User-Agent': 'SciPaperLoader/1.0'} + response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True) + + # Check for invalid DOI (404) or other HTTP errors + if response.status_code == 404: + error_msg = f"Invalid DOI: {doi} not found" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "invalid_doi"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + response.raise_for_status() # Raise for other HTTP errors + + # Save HTML content + with open(file_path, 'w', encoding='utf-8') as f: + f.write(response.text) + + # Update paper status to success + paper.status = self.OUTPUT_STATUS_SUCCESS + paper.file_path = file_path + paper.error_msg = None + db.session.commit() + + # Log success + self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id) + + return ScrapeResult( + status="success", + message=f"Successfully fetched HTML for {doi}", + data={ + "file_path": file_path, + "url": response.url, # Final URL after redirects + "title": paper.title + }, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except requests.exceptions.RequestException as e: + error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + ActivityLog.log_scraper_activity( + action="html_fetch", + status="error", + description=error_msg, + paper_id=paper.id + ) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "network_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except Exception as e: + error_msg = f"Failed to save HTML file: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, 
error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "file_creation_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) \ No newline at end of file diff --git a/scipaperloader/scrapers/publisher_detector.py b/scipaperloader/scrapers/publisher_detector.py new file mode 100644 index 0000000..3827d57 --- /dev/null +++ b/scipaperloader/scrapers/publisher_detector.py @@ -0,0 +1,282 @@ +import time +import requests +import re +from urllib.parse import urlparse +from datetime import datetime +from typing import Optional +from .base import BaseScraper, ScrapeResult +from flask import current_app +from ..models import PaperMetadata, ActivityLog, DownloadPathConfig +from ..db import db + +class Scraper(BaseScraper): + """Publisher detection scraper that identifies the publisher from the final URL after DOI redirect.""" + + # This scraper processes "New" papers and outputs "PublisherDetected"/"Failed" + INPUT_STATUSES = ["New"] + OUTPUT_STATUS_SUCCESS = "PublisherDetected" + OUTPUT_STATUS_FAILURE = "Failed" + OUTPUT_STATUS_PROCESSING = "DetectingPublisher" + + # Publisher detection patterns based on URL domains and paths + PUBLISHER_URL_PATTERNS = { + 'elsevier': [ + r'sciencedirect\.com', + r'elsevier\.com', + r'.*\.elsevier\.com' + ], + 'springer': [ + r'link\.springer\.com', + r'springer\.com', + r'.*\.springer\.com' + ], + 'wiley': [ + r'onlinelibrary\.wiley\.com', + r'wiley\.com', + r'.*\.wiley\.com' + ], + 'ieee': [ + r'ieeexplore\.ieee\.org', + r'ieee\.org', + r'.*\.ieee\.org' + ], + 'plos': [ + r'journals\.plos\.org', + r'plos\.org', + r'.*\.plos\.org' + ], + 'nature': [ + r'nature\.com', + r'.*\.nature\.com' + ], + 'sage': [ + r'journals\.sagepub\.com', + r'sagepub\.com', + r'.*\.sagepub\.com' + ], + 'taylor_francis': [ + r'tandfonline\.com', + r'.*\.tandfonline\.com' + ], + 'acs': [ + r'pubs\.acs\.org', + r'acs\.org', + r'.*\.acs\.org' + ], + 'arxiv': [ + r'arxiv\.org', + r'export\.arxiv\.org' + ], + 'pubmed': [ + r'pubmed\.ncbi\.nlm\.nih\.gov', + r'ncbi\.nlm\.nih\.gov' + ], + 'oxford': [ + r'academic\.oup\.com', + r'oup\.com', + r'.*\.oup\.com' + ], + 'cambridge': [ + r'cambridge\.org', + r'.*\.cambridge\.org' + ], + 'biorxiv': [ + r'biorxiv\.org', + r'.*\.biorxiv\.org' + ], + 'researchgate': [ + r'researchgate\.net', + r'.*\.researchgate\.net' + ] + } + + def scrape(self, doi: str) -> ScrapeResult: + """Detect publisher from the final URL after DOI redirect.""" + start_time = time.time() + + paper = PaperMetadata.query.filter_by(doi=doi).first() + if not paper: + return ScrapeResult( + status="error", + message=f"No paper found for DOI {doi}", + data=None, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Log start of scraping + self.log_scrape_start(doi, paper.id) + + # Update status to processing + paper.status = self.OUTPUT_STATUS_PROCESSING + db.session.commit() + + try: + # Get the final URL by following the DOI redirect + final_url = self._get_final_url(doi) + + if not final_url: + error_msg = f"Could not resolve DOI {doi} to a URL" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "doi_resolution_failed"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Detect publisher from URL + detected_publisher = self._detect_publisher_from_url(final_url) + + if 
detected_publisher: + # Update paper with detected publisher + paper.publisher = detected_publisher + paper.status = self.OUTPUT_STATUS_SUCCESS + paper.error_msg = None + db.session.commit() + + success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}" + self.log_scrape_success(doi, success_msg, paper.id) + + return ScrapeResult( + status="success", + message=success_msg, + data={ + "publisher": detected_publisher, + "final_url": final_url + }, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + else: + error_msg = f"Could not detect publisher from URL: {final_url}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={ + "final_url": final_url, + "error_code": "publisher_not_detected" + }, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except Exception as e: + error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "publisher_detection_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + def _get_final_url(self, doi: str) -> Optional[str]: + """ + Get the final URL after following DOI redirects. + + Args: + doi: The DOI to resolve + + Returns: + Final URL after redirects, or None if resolution fails + """ + try: + doi_url = f"https://doi.org/{doi}" + headers = { + 'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + } + + # Make a HEAD request to get the final URL without downloading content + response = requests.head( + doi_url, + headers=headers, + timeout=15, + allow_redirects=True + ) + + # If HEAD is not allowed, try GET but with minimal content + if response.status_code == 405: # Method Not Allowed + response = requests.get( + doi_url, + headers=headers, + timeout=15, + allow_redirects=True, + stream=True # Don't download the full content + ) + response.close() # Close connection after getting headers + + if response.status_code in [200, 302, 301]: + return response.url + else: + return None + + except Exception as e: + # Log error but don't raise - we'll handle this gracefully + return None + + def _detect_publisher_from_url(self, url: str) -> Optional[str]: + """ + Detect publisher from URL using domain patterns. + + Args: + url: The URL to analyze + + Returns: + Publisher name if detected, None otherwise + """ + if not url: + return None + + # Parse the URL to get the domain + parsed_url = urlparse(url) + domain = parsed_url.netloc.lower() + + # Remove 'www.' 
prefix if present + if domain.startswith('www.'): + domain = domain[4:] + + # Score each publisher based on URL pattern matches + publisher_scores = {} + + for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items(): + score = 0 + for pattern in patterns: + if re.search(pattern, domain, re.IGNORECASE): + score += 10 # Strong match for domain patterns + + # Also check the full URL for path-based patterns + if re.search(pattern, url.lower(), re.IGNORECASE): + score += 5 + + if score > 0: + publisher_scores[publisher] = score + + # Return the publisher with the highest score + if publisher_scores: + best_publisher = max(publisher_scores.keys(), key=lambda x: publisher_scores[x]) + + # Only return if we have a reasonable confidence (score > 5) + if publisher_scores[best_publisher] > 5: + return best_publisher + + return None \ No newline at end of file diff --git a/scipaperloader/scrapers/text_extractor.py b/scipaperloader/scrapers/text_extractor.py new file mode 100644 index 0000000..f0a302d --- /dev/null +++ b/scipaperloader/scrapers/text_extractor.py @@ -0,0 +1,237 @@ +import time +import os +from datetime import datetime +from typing import Optional +from .base import BaseScraper, ScrapeResult +from flask import current_app +from ..models import PaperMetadata, ActivityLog, DownloadPathConfig +from ..db import db +from ..parsers.base_parser import BaseParser, ParseError +from ..parsers.elsevier_parser import ElsevierParser +from ..parsers.arxiv_parser import ArxivParser + +class Scraper(BaseScraper): + """Full text extraction scraper that uses publisher-specific parsers.""" + + # This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed" + INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"] + OUTPUT_STATUS_SUCCESS = "TextExtracted" + OUTPUT_STATUS_FAILURE = "Failed" + OUTPUT_STATUS_PROCESSING = "ExtractingText" + + def __init__(self): + super().__init__() + # Registry of available parsers + self.parsers = [ + ElsevierParser(), + ArxivParser(), + # Add more parsers here as you create them + # SpringerParser(), + # WileyParser(), + # IEEEParser(), + ] + + def scrape(self, doi: str) -> ScrapeResult: + """Extract full text using appropriate publisher parser.""" + start_time = time.time() + + paper = PaperMetadata.query.filter_by(doi=doi).first() + if not paper: + return ScrapeResult( + status="error", + message=f"No paper found for DOI {doi}", + data=None, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Log start of scraping + self.log_scrape_start(doi, paper.id) + + # Update status to processing + paper.status = self.OUTPUT_STATUS_PROCESSING + db.session.commit() + + # Check if HTML file exists + if not paper.file_path or not os.path.exists(paper.file_path): + error_msg = f"HTML file not found for DOI {doi}. 
Expected at: {paper.file_path}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "html_file_not_found"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + try: + # Read HTML content + with open(paper.file_path, 'r', encoding='utf-8') as f: + html_content = f.read() + + # Find appropriate parser + parser = self._select_parser(html_content) + + if not parser: + error_msg = f"No suitable parser found for DOI {doi}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "no_parser_available"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Parse content + parsed_content = parser.parse(html_content, doi) + + # Validate parsed content + if not parser.validate_content(parsed_content): + error_msg = f"Parsed content validation failed for DOI {doi}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "content_validation_failed"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Save extracted text to file + text_file_path = self._save_extracted_text(parsed_content, doi) + + # Update paper status to success + paper.status = self.OUTPUT_STATUS_SUCCESS + paper.error_msg = None + # You might want to add a text_file_path field to store the text file location + # paper.text_file_path = text_file_path + db.session.commit() + + success_msg = f"Successfully extracted text using {parser.get_name()} parser" + self.log_scrape_success(doi, success_msg, paper.id) + + return ScrapeResult( + status="success", + message=f"Successfully extracted full text for {doi}", + data={ + "text_file_path": text_file_path, + "parser_used": parser.get_name(), + "title": parsed_content.title, + "word_count": len(parsed_content.full_text.split()), + "has_abstract": bool(parsed_content.abstract), + "has_sections": bool(parsed_content.sections), + "author_count": len(parsed_content.authors) if parsed_content.authors else 0, + "keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0, + "reference_count": len(parsed_content.references) if parsed_content.references else 0 + }, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except ParseError as e: + error_msg = f"Parser error for DOI {doi}: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "parser_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except Exception as e: + error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "extraction_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + def 
_select_parser(self, html_content: str) -> Optional[BaseParser]: + """ + Select the most appropriate parser for the HTML content. + + Args: + html_content: The HTML content to analyze + + Returns: + The best parser for this content, or None if no parser can handle it + """ + for parser in self.parsers: + if parser.can_parse(html_content): + return parser + + return None + + def _save_extracted_text(self, parsed_content, doi: str) -> str: + """ + Save extracted text to a file. + + Args: + parsed_content: The parsed content object + doi: The DOI of the paper + + Returns: + Path to the saved text file + """ + download_path = DownloadPathConfig.get_path() + text_file_name = f"{doi.replace('/', '_')}_fulltext.txt" + text_file_path = os.path.join(download_path, text_file_name) + + with open(text_file_path, 'w', encoding='utf-8') as f: + # Write structured content + f.write(f"DOI: {parsed_content.doi or doi}\n") + f.write(f"Title: {parsed_content.title or 'Unknown'}\n") + f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n") + f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n") + + if parsed_content.authors: + f.write(f"Authors: {', '.join(parsed_content.authors)}\n") + + if parsed_content.keywords: + f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n") + + f.write(f"Extracted: {datetime.utcnow().isoformat()}\n") + f.write("=" * 80 + "\n\n") + + # Write full text + f.write(parsed_content.full_text) + + # Optionally write references at the end + if parsed_content.references: + f.write("\n\n" + "=" * 80 + "\n") + f.write("REFERENCES\n") + f.write("=" * 80 + "\n") + for i, ref in enumerate(parsed_content.references, 1): + f.write(f"{i}. {ref}\n") + + return text_file_path diff --git a/scipaperloader/scrapers/web_fetcher.py b/scipaperloader/scrapers/web_fetcher.py new file mode 100644 index 0000000..ec5756f --- /dev/null +++ b/scipaperloader/scrapers/web_fetcher.py @@ -0,0 +1,201 @@ +import time +import os +import requests +from urllib.parse import urlparse +from datetime import datetime +from .base import BaseScraper, ScrapeResult +from flask import current_app +from ..models import PaperMetadata, ActivityLog, DownloadPathConfig +from ..db import db + +class Scraper(BaseScraper): + """Web fetcher scraper that downloads HTML content from DOI URLs.""" + + # This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed" + INPUT_STATUSES = ["New"] + OUTPUT_STATUS_SUCCESS = "WebContentDownloaded" + OUTPUT_STATUS_FAILURE = "Failed" + OUTPUT_STATUS_PROCESSING = "FetchingWebContent" + + def scrape(self, doi: str) -> ScrapeResult: + """Fetch HTML content from DOI and save to download path.""" + start_time = time.time() + + paper = PaperMetadata.query.filter_by(doi=doi).first() + if not paper: + return ScrapeResult( + status="error", + message=f"No paper found for DOI {doi}", + data=None, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Log start of scraping + self.log_scrape_start(doi, paper.id) + + # Update status to processing + paper.status = self.OUTPUT_STATUS_PROCESSING + db.session.commit() + + # Prepare file paths + download_path = DownloadPathConfig.get_path() + file_name = f"{doi.replace('/', '_')}.html" + file_path = os.path.join(download_path, file_name) + + # Check/create download directory + if not os.path.exists(download_path): + try: + os.makedirs(download_path, exist_ok=True) + except OSError as e: + error_msg = f"Failed to create download directory: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + 
paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "path_creation_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Check path permissions + if not os.access(download_path, os.W_OK): + error_msg = f"Download path '{download_path}' is not writable" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "path_write_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + try: + # Fetch HTML from DOI + doi_url = f"https://doi.org/{doi}" + headers = { + 'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1' + } + + response = requests.get( + doi_url, + headers=headers, + timeout=30, + allow_redirects=True, + verify=True + ) + + # Check for invalid DOI (404) or other HTTP errors + if response.status_code == 404: + error_msg = f"Invalid DOI: {doi} not found (404)" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "invalid_doi"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Check for other HTTP errors + response.raise_for_status() + + # Save HTML content + with open(file_path, 'w', encoding='utf-8') as f: + f.write(response.text) + + # Extract final URL after redirects (for publisher detection) + final_url = response.url + + # Update paper status to success + paper.status = self.OUTPUT_STATUS_SUCCESS + paper.file_path = file_path + paper.error_msg = None + db.session.commit() + + # Log success + success_msg = f"Successfully fetched HTML content for {doi} from {final_url}" + self.log_scrape_success(doi, success_msg, paper.id) + + return ScrapeResult( + status="success", + message=f"Successfully fetched HTML for {doi}", + data={ + "file_path": file_path, + "final_url": final_url, + "content_length": len(response.text), + "content_type": response.headers.get('content-type', 'unknown'), + "title": paper.title, + "domain": urlparse(final_url).netloc if final_url else None + }, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except requests.exceptions.HTTPError as e: + error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "http_error", "status_code": e.response.status_code}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except requests.exceptions.RequestException as e: + error_msg = f"Network error fetching {doi_url}: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "network_error"}, + 
duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + except Exception as e: + error_msg = f"Failed to save HTML file: {str(e)}" + paper.status = self.OUTPUT_STATUS_FAILURE + paper.error_msg = error_msg + db.session.commit() + + self.log_scrape_failure(doi, error_msg, paper.id) + + return ScrapeResult( + status="error", + message=error_msg, + data={"error_code": "file_creation_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) diff --git a/scipaperloader/static/js/README.md b/scipaperloader/static/js/README.md new file mode 100644 index 0000000..61fbf15 --- /dev/null +++ b/scipaperloader/static/js/README.md @@ -0,0 +1,384 @@ +# JavaScript Modularization Documentation + +## Overview + +The JavaScript code in the SciPaperLoader application has been modularized into reusable components to improve maintainability, reduce code duplication, and enable easier testing and updates. + +## Modularization Task Completed + +### Problem Statement +The original codebase had ~800+ lines of inline JavaScript scattered across multiple Jinja templates with several critical issues: +- **Code Duplication**: Similar functionality replicated across templates +- **Maintenance Difficulty**: Changes required editing multiple template files +- **Linter Issues**: Jinja template syntax mixed with JavaScript caused linting errors +- **Testing Challenges**: Inline code was difficult to unit test +- **Poor Separation of Concerns**: Template logic mixed with application logic + +### Solution Implemented +Successfully transformed the codebase by: + +1. **Extracted 10 Modular JavaScript Files** (~800+ lines of code moved from templates) +2. **Eliminated Code Duplication** by creating reusable components +3. **Fixed Linter Compatibility** by separating template syntax from JavaScript logic +4. **Implemented Clean Variable Passing** using JSON script tags instead of direct Jinja embedding +5. **Created Class-Based Architecture** with proper inheritance and composition patterns +6. **Established Inter-Component Communication** through callback systems +7. **Added Comprehensive Error Handling** and loading states throughout + +### Key Achievements +- ✅ **5 templates modularized**: `scraper.html.jinja`, `papers.html.jinja`, `upload.html.jinja`, `logger.html.jinja`, `config/schedule.html.jinja` +- ✅ **10 JavaScript modules created**: Covering all functionality from utilities to dashboard coordination +- ✅ **Zero functionality loss**: All existing features preserved during modularization +- ✅ **Improved maintainability**: Changes now require editing single module files +- ✅ **Enhanced testability**: Individual modules can be unit tested +- ✅ **Clean variable handling**: Jinja variables passed as JSON configuration instead of inline embedding + +### Before vs After Example +**Before (inline in template)**: +```html + +``` + +**After (modular)**: +```html + + + +``` + +## Modular JavaScript Files + +### 1. `/static/js/common.js` +**Purpose**: Common utilities used across the application + +**Key Functions**: +- `showFlashMessage(message, type)` - Display flash messages to users +- `createStatusBadge(status)` - Generate status badge HTML +- `formatTimestamp(timestamp)` - Format timestamps for display +- `truncateText(text, maxLength)` - Truncate text with ellipsis +- `toggleButtonLoading(button, loading, loadingText)` - Handle button loading states +- `apiRequest(url, options)` - Generic API request wrapper + +**Used by**: All templates that need basic utilities + +### 2. 
`/static/js/modal-handler.js` +**Purpose**: Handle modal dialogs with dynamic content loading + +**Key Features**: +- AJAX content loading +- Error handling +- Automatic click handler setup +- Bootstrap modal integration + +**Used by**: +- `papers.html.jinja` (paper details modal) +- `logger.html.jinja` (log details modal) + +### 3. `/static/js/form-handler.js` +**Purpose**: Handle form submissions with progress tracking + +**Key Features**: +- Progress modal display +- Task status polling +- Error handling +- Customizable callbacks + +**Used by**: +- `upload.html.jinja` (CSV upload form) + +### 4. `/static/js/chart.js` +**Purpose**: Handle Chart.js activity visualization + +**Key Features**: +- Chart initialization and rendering +- Data loading from API +- Error handling for missing Chart.js + +**Used by**: +- `scraper.html.jinja` (activity charts) + +### 5. `/static/js/scraper-control.js` +**Purpose**: Handle scraper control operations (start/stop/pause/reset) + +**Key Features**: +- Status polling +- Volume configuration +- Callback system for refreshing other components + +**Used by**: +- `scraper.html.jinja` + +### 6. `/static/js/paper-processor.js` +**Purpose**: Handle paper search and processing functionality + +**Key Features**: +- Paper search +- Single paper processing +- Status polling +- Scraper selection + +**Used by**: +- `scraper.html.jinja` + +### 7. `/static/js/activity-monitor.js` +**Purpose**: Handle activity log display and real-time notifications + +**Key Features**: +- Activity log loading +- Real-time updates +- Notification management + +**Used by**: +- `scraper.html.jinja` + +### 8. `/static/js/scraper-dashboard.js` +**Purpose**: Coordinate all scraper dashboard components + +**Key Features**: +- Component initialization +- Inter-component communication +- Configuration management + +**Used by**: +- `scraper.html.jinja` + +### 9. `/static/js/config-handler.js` +**Purpose**: Handle configuration forms and Alpine.js integration + +**Key Features**: +- Configuration API calls +- Alpine.js data objects +- Schedule management +- Volume updates + +**Used by**: +- `config/schedule.html.jinja` + +## Template Updates + +### Templates Using Modular JavaScript + +1. **scraper.html.jinja** + - Uses all scraper-related modules + - Passes Jinja variables as configuration parameters + - Initializes dashboard with `initScraperDashboard(config)` + +2. **papers.html.jinja** + - Uses `modal-handler.js` for paper detail modals + - Simplified from custom modal code to single line initialization + +3. **upload.html.jinja** + - Uses `form-handler.js` for upload progress tracking + - Custom result display function + - Automatic task status polling + +4. **logger.html.jinja** + - Uses `modal-handler.js` for log detail modals + - Custom URL construction for log endpoints + +5. **config/schedule.html.jinja** + - Uses `config-handler.js` for Alpine.js integration + - Modular schedule management functions + +## Benefits of Modularization + +### 1. **Reusability** +- Modal functionality shared between papers and logger templates +- Common utilities used across all templates +- Form handling can be reused for other forms + +### 2. **Maintainability** +- Single place to update common functionality +- Clear separation of concerns +- Easier debugging and testing + +### 3. **Parameter Passing** +- Jinja variables passed as configuration objects +- No more hardcoded values in JavaScript +- Environment-specific settings easily configurable + +### 4. 
**Extensibility** +- Easy to add new functionality to existing modules +- New templates can easily use existing modules +- Plugin-like architecture for components + +## Usage Examples + +### Basic Modal Usage +```javascript +const modal = new ModalHandler('modalId', 'contentElementId'); +modal.setupClickHandlers('.clickable-items'); +``` + +### Form with Progress Tracking +```javascript +const formHandler = new FormHandler('formId', { + onSuccess: (result) => console.log('Success:', result), + onError: (error) => console.log('Error:', error) +}); +``` + +### Configuration Management +```javascript +// In Alpine.js template +x-data="configHandler.createScheduleManager(initialData, volume)" +``` + +## Migration Notes + +### Old vs New Approach + +**Before**: Inline JavaScript in each template +```html + +``` + +**After**: Modular imports with configuration +```html + + + +``` + +### Jinja Variable Handling + +To properly separate Jinja template variables from JavaScript code and avoid linting issues, we use a clean JSON configuration approach: + +**Before**: Variables embedded directly in JavaScript (causes linting issues) +```javascript +if (volume > {{ max_volume }}) { + // Error handling - JSLint will complain about {{ }} +} +``` + +**After**: Clean separation using JSON script tags +```html + + + + + +``` + +**Benefits of this approach**: +- **Linter-friendly**: No template syntax in JavaScript files +- **Type-safe**: JSON ensures proper data types +- **Maintainable**: Clear separation of concerns +- **Secure**: Automatic escaping with `|tojson` filter +- **Debuggable**: Easy to inspect configuration in DevTools + +**Real-world example from scraper.html.jinja**: +```html + + + +``` + +## Future Improvements + +### Potential Enhancements +1. **Bundle Management**: Consider using webpack or similar for production builds +2. **Unit Testing**: Add comprehensive test suite for individual modules +3. **JSDoc Comments**: Add detailed documentation for better IDE support +4. **Centralized Error Reporting**: Implement global error handling system +5. **Performance Optimization**: Implement lazy loading for non-critical modules +6. **TypeScript Migration**: Consider migrating to TypeScript for better type safety + +### Adding New Modules +When creating new JavaScript modules: +1. Follow the established class-based pattern +2. Include proper error handling +3. Use the configuration pattern for Jinja variables +4. Add documentation to this README +5. Update templates to use the new module + +## Testing + +A test file `test_js_modularization.py` has been created to verify the modularization. To run comprehensive testing: + +```bash +python test_js_modularization.py +``` + +This will verify: +- All JavaScript files exist and are properly formatted +- Templates correctly reference the modular files +- Configuration patterns are properly implemented +- No inline JavaScript remains in templates + +## Maintenance + +### When Making Changes +1. **Update Single Module**: Changes to functionality only require editing one file +2. **Test Affected Templates**: Ensure all templates using the module still work +3. **Update Documentation**: Keep this README current with any changes +4. 
**Consider Dependencies**: Check if changes affect other modules + +### File Organization +``` +/static/js/ +├── README.md # This documentation +├── common.js # Shared utilities +├── modal-handler.js # Modal functionality +├── form-handler.js # Form processing +├── chart.js # Chart visualization +├── scraper-control.js # Scraper operations +├── paper-processor.js # Paper management +├── activity-monitor.js # Activity tracking +├── scraper-dashboard.js # Dashboard coordination +├── config-handler.js # Configuration management +└── table-handler.js # Table utilities +``` + +## Migration Summary + +The modularization successfully transformed **~800+ lines of inline JavaScript** from templates into a maintainable, reusable module system. This improvement provides: + +- **Enhanced maintainability** through single-responsibility modules +- **Reduced code duplication** via shared utility functions +- **Improved linter compatibility** by separating template and JavaScript concerns +- **Better testability** with isolated, unit-testable modules +- **Cleaner templates** with minimal, configuration-only JavaScript +- **Easier debugging** with clearly separated concerns and proper error handling + +All existing functionality has been preserved while significantly improving the codebase architecture and developer experience. \ No newline at end of file diff --git a/scipaperloader/static/js/scraper-control.js b/scipaperloader/static/js/scraper-control.js index 614653f..4215420 100644 --- a/scipaperloader/static/js/scraper-control.js +++ b/scipaperloader/static/js/scraper-control.js @@ -38,12 +38,12 @@ class ScraperController { this.resetButton.addEventListener("click", () => this.resetScraper()); } - // Volume form - const volumeForm = document.getElementById("volumeForm"); - if (volumeForm) { - volumeForm.addEventListener("submit", (e) => { + // Configuration form (handles both volume and scraper module) + const configForm = document.getElementById("volumeForm"); + if (configForm) { + configForm.addEventListener("submit", (e) => { e.preventDefault(); - this.updateVolume(); + this.updateConfiguration(); }); } } @@ -245,25 +245,46 @@ class ScraperController { } /** - * Update volume configuration + * Update configuration (volume and/or scraper module) */ - async updateVolume() { + async updateConfiguration() { const volumeInput = document.getElementById("volumeInput"); + const scraperSelect = document.getElementById("mainScraperSelect"); const submitButton = document.querySelector( '#volumeForm button[type="submit"]' ); - if (!volumeInput || !submitButton) return; + if (!submitButton) return; - const volume = volumeInput.value; + const updates = {}; + let hasChanges = false; - // Basic validation - if (!volume || volume < 1 || volume > this.maxVolume) { - showFlashMessage( - `Please enter a valid volume between 1 and ${this.maxVolume}`, - "warning" - ); - volumeInput.focus(); + // Check volume changes + if (volumeInput) { + const volume = volumeInput.value; + + // Basic validation + if (!volume || volume < 1 || volume > this.maxVolume) { + showFlashMessage( + `Please enter a valid volume between 1 and ${this.maxVolume}`, + "warning" + ); + volumeInput.focus(); + return; + } + + updates.volume = volume; + hasChanges = true; + } + + // Check scraper module changes + if (scraperSelect && scraperSelect.value) { + updates.scraper_module = scraperSelect.value; + hasChanges = true; + } + + if (!hasChanges) { + showFlashMessage("No changes to save", "info"); return; } @@ -273,21 +294,24 @@ class ScraperController 
{ try { const data = await apiRequest("/scraper/update_config", { method: "POST", - body: JSON.stringify({ volume: volume }), + body: JSON.stringify(updates), }); if (data.success) { showFlashMessage( - data.message || "Volume updated successfully", + data.message || "Configuration updated successfully", "success" ); } else { - showFlashMessage(data.message || "Failed to update volume", "error"); + showFlashMessage( + data.message || "Failed to update configuration", + "error" + ); } } catch (error) { - console.error("Error updating volume:", error); + console.error("Error updating configuration:", error); showFlashMessage( - "Network error while updating volume. Please try again.", + "Network error while updating configuration. Please try again.", "error" ); } finally { diff --git a/scipaperloader/static/js/scraper-overview.js b/scipaperloader/static/js/scraper-overview.js new file mode 100644 index 0000000..03ce4f1 --- /dev/null +++ b/scipaperloader/static/js/scraper-overview.js @@ -0,0 +1,500 @@ +/** + * Scraper Overview functionality + */ + +class ScraperOverview { + constructor() { + this.modal = null; + this.scrapers = []; + this.systemConfig = {}; + this.init(); + } + + init() { + // Initialize modal reference + this.modal = document.getElementById("scraperOverviewModal"); + + // Load data when modal is shown + if (this.modal) { + this.modal.addEventListener("show.bs.modal", () => { + this.loadScraperOverview(); + }); + } + } + + async loadScraperOverview() { + const loadingEl = document.getElementById("scraperOverviewLoading"); + const errorEl = document.getElementById("scraperOverviewError"); + const contentEl = document.getElementById("scraperOverviewContent"); + + // Show loading state + loadingEl?.classList.remove("d-none"); + errorEl?.classList.add("d-none"); + contentEl?.classList.add("d-none"); + + try { + // Load scrapers, system config, and publishers in parallel + const [scrapersResponse, statusResponse, publishersResponse] = + await Promise.all([ + fetch("/scraper/scrapers"), + fetch("/scraper/status"), + fetch("/scraper/publishers"), + ]); + + if ( + !scrapersResponse.ok || + !statusResponse.ok || + !publishersResponse.ok + ) { + throw new Error("Failed to load scraper information"); + } + + const scrapersData = await scrapersResponse.json(); + const statusData = await statusResponse.json(); + const publishersData = await publishersResponse.json(); + + if ( + !scrapersData.success || + !statusData.success || + !publishersData.success + ) { + throw new Error( + scrapersData.message || + statusData.message || + publishersData.message || + "Unknown error" + ); + } + + this.scrapers = scrapersData.scrapers; + this.systemConfig = statusData; + this.publishersData = publishersData.data; + + // Update UI + this.updateSystemConfig(); + this.updateScrapersTable(); + this.updatePublishersSection(); + this.updateStatusFlowDiagram(); + + // Show content + loadingEl?.classList.add("d-none"); + contentEl?.classList.remove("d-none"); + } catch (error) { + console.error("Error loading scraper overview:", error); + + // Show error state + loadingEl?.classList.add("d-none"); + const errorMessage = document.getElementById( + "scraperOverviewErrorMessage" + ); + if (errorMessage) { + errorMessage.textContent = + error.message || "Failed to load scraper information"; + } + errorEl?.classList.remove("d-none"); + } + } + + updateSystemConfig() { + // Current scraper module + const currentModuleEl = document.getElementById("currentScraperModule"); + if (currentModuleEl) { + const currentModule 
= + this.systemConfig.current_scraper_module || "System Default"; + currentModuleEl.textContent = currentModule; + currentModuleEl.className = "badge bg-primary"; + } + + // Volume limit + const volumeLimitEl = document.getElementById("currentVolumeLimit"); + if (volumeLimitEl) { + const volumeLimit = this.systemConfig.volume_config || "Unknown"; + volumeLimitEl.textContent = volumeLimit; + } + + // Total modules + const totalModulesEl = document.getElementById("totalScraperModules"); + if (totalModulesEl) { + totalModulesEl.textContent = this.scrapers.length; + } + + // Paper counts summary + const paperCountsEl = document.getElementById("paperCountsSummary"); + if (paperCountsEl && this.systemConfig.paper_counts) { + const counts = this.systemConfig.paper_counts; + paperCountsEl.innerHTML = ` +
<div class="d-flex flex-wrap gap-1">
+          <span class="badge bg-primary">${counts.new || 0} New</span>
+          <span class="badge bg-warning">${counts.processing || 0} Processing</span>
+          <span class="badge bg-success">${counts.done || 0} Done</span>
+          <span class="badge bg-danger">${counts.failed || 0} Failed</span>
+          <span class="badge bg-warning">${counts.pending || 0} Pending</span>
+          <span class="badge bg-warning">${counts.retrying || 0} Retrying</span>
+        </div>
+      `;
+    }
+  }
+
+  updateScrapersTable() {
+    const tbody = document.getElementById("scrapersTableBody");
+    if (!tbody) return;
+
+    tbody.innerHTML = "";
+
+    this.scrapers.forEach((scraper) => {
+      const row = document.createElement("tr");
+
+      // Check if this is the current active scraper
+      const isCurrentScraper =
+        scraper.name === this.systemConfig.current_scraper_module;
+
+      if (scraper.error) {
+        row.innerHTML = `
+          <td><strong>${scraper.name}</strong></td>
+          <td colspan="5" class="text-danger">
+            ${scraper.error}
+          </td>
+        `;
+      } else {
+        row.innerHTML = `
+          <td>
+            <strong>${scraper.name}</strong>
+            ${
+              scraper.name === "dummy"
+                ? '<span class="badge bg-secondary ms-1">Test Module</span>'
+                : ""
+            }
+            ${
+              isCurrentScraper
+                ? '<span class="badge bg-success ms-1">Active</span>'
+                : ""
+            }
+          </td>
+          <td>
+            ${this.truncateDescription(scraper.description)}
+          </td>
+          <td>
+            ${this.renderStatusBadges(
+              scraper.input_statuses,
+              "bg-info"
+            )}
+          </td>
+          <td>
+            <span class="badge bg-success">${
+              scraper.output_status_success
+            }</span>
+          </td>
+          <td>
+            <span class="badge bg-danger">${
+              scraper.output_status_failure
+            }</span>
+          </td>
+          <td>
+            <span class="badge bg-warning">${
+              scraper.output_status_processing
+            }</span>
+          </td>
+        `;
+      }
+
+      // Highlight the current scraper row
+      if (isCurrentScraper) {
+        row.classList.add("table-success");
+      }
+
+      tbody.appendChild(row);
+    });
+  }
+
+  updateStatusFlowDiagram() {
+    const diagramEl = document.getElementById("statusFlowDiagram");
+    if (!diagramEl) return;
+
+    // Analyze actual scrapers to build real flow
+    const statusFlow = this.analyzeScraperFlow();
+
+    let diagramHTML = '<div class="d-flex flex-column gap-2">';
+
+    // Create visual flow based on actual scrapers
+    statusFlow.forEach((stage, index) => {
+      if (index > 0) {
+        diagramHTML +=
+          '<div class="text-center text-muted">&darr;</div>';
+      }
+
+      diagramHTML += '<div class="card">';
+      diagramHTML += `<div class="card-header py-1"><strong>${stage.title}</strong></div>`;
+
+      if (stage.scrapers && stage.scrapers.length > 0) {
+        diagramHTML +=
+          '<div class="px-2 pt-1 small text-muted">Handled by: ' +
+          stage.scrapers.map((s) => `<code>${s}</code>`).join(", ") +
+          "</div>";
+      }
+
+      diagramHTML += '<div class="card-body py-2">';
+      stage.statuses.forEach((status, statusIndex) => {
+        if (statusIndex > 0) {
+          diagramHTML += '<span class="mx-1 text-muted">&rarr;</span>';
+        }
+
+        const badgeClass = this.getStatusBadgeClass(status);
+        diagramHTML += `<span class="badge ${badgeClass}">${status}</span>`;
+      });
+      diagramHTML += "</div>";
+
+      if (stage.description) {
+        diagramHTML += `<div class="card-footer py-1 small text-muted">${stage.description}</div>`;
+      }
+
+      diagramHTML += "</div>";
+    });
+
+    diagramHTML += "</div>";
+
+    // Add explanation
+    diagramHTML += `
+      <div class="mt-3 small text-muted">
+        <strong>Flow Explanation:</strong>
+        Each scraper module consumes papers in its input statuses and moves them to its success or failure output status.
+      </div>
+ `; + + diagramEl.innerHTML = diagramHTML; + } + + analyzeScraperFlow() { + // Build actual flow based on available scrapers + const stages = []; + const allInputStatuses = new Set(); + const allOutputStatuses = new Set(); + const scrapersByInput = {}; + + // Analyze scrapers to understand the flow + this.scrapers.forEach((scraper) => { + if (scraper.input_statuses) { + scraper.input_statuses.forEach((status) => { + allInputStatuses.add(status); + if (!scrapersByInput[status]) { + scrapersByInput[status] = []; + } + scrapersByInput[status].push(scraper.name); + }); + } + + if (scraper.output_status_success) + allOutputStatuses.add(scraper.output_status_success); + if (scraper.output_status_failure) + allOutputStatuses.add(scraper.output_status_failure); + }); + + // Entry point + if (allInputStatuses.has("New")) { + stages.push({ + title: "Entry Point", + statuses: ["New"], + scrapers: scrapersByInput["New"] || [], + description: "Newly uploaded papers enter the processing pipeline", + }); + } + + // Processing stages + const processingStatuses = Array.from(allInputStatuses).filter( + (status) => !["New", "Done", "Failed"].includes(status) + ); + + if (processingStatuses.length > 0) { + stages.push({ + title: "Processing Stages", + statuses: processingStatuses, + scrapers: [], + description: "Papers move through various processing stages", + }); + } + + // Final outputs + const finalStatuses = ["Done", "Failed"]; + stages.push({ + title: "Final States", + statuses: finalStatuses.filter((status) => allOutputStatuses.has(status)), + scrapers: [], + description: "Papers end up in final success or failure states", + }); + + // Retry handling + if (allInputStatuses.has("Failed")) { + stages.push({ + title: "Retry Processing", + statuses: ["Failed", "Retrying"], + scrapers: scrapersByInput["Failed"] || [], + description: "Failed papers can be retried with specialized scrapers", + }); + } + + return stages; + } + + getStatusBadgeClass(status) { + const statusClasses = { + New: "bg-primary", + Pending: "bg-warning", + Processing: "bg-warning", + Retrying: "bg-warning", + Done: "bg-success", + Failed: "bg-danger", + HtmlDownloaded: "bg-info", + PublisherDetected: "bg-info", + TextExtracted: "bg-info", + }; + + return statusClasses[status] || "bg-secondary"; + } + + renderStatusBadges(statuses, defaultClass = "bg-secondary") { + if (!Array.isArray(statuses)) return ""; + + return statuses + .map( + (status) => + `${status}` + ) + .join(""); + } + + truncateDescription(description, maxLength = 100) { + if (!description) return "No description available"; + + if (description.length <= maxLength) return description; + + return description.substring(0, maxLength).trim() + "..."; + } + + updatePublishersSection() { + // Update publisher statistics + const publisherStatsEl = document.getElementById("publisherStats"); + if (publisherStatsEl && this.publishersData && this.publishersData.stats) { + const stats = this.publishersData.stats; + publisherStatsEl.innerHTML = ` +
<div class="row text-center">
+          <div class="col-md-3">
+            <div class="border rounded p-2">
+              <h4 class="mb-1">${stats.total_publishers}</h4>
+              <small class="text-muted">Total Publishers</small>
+            </div>
+          </div>
+          <div class="col-md-3">
+            <div class="border rounded p-2">
+              <h4 class="mb-1 text-success">${stats.publishers_with_parsers}</h4>
+              <small class="text-muted">With Parsers</small>
+            </div>
+          </div>
+          <div class="col-md-3">
+            <div class="border rounded p-2">
+              <h4 class="mb-1 text-warning">${stats.publishers_without_parsers}</h4>
+              <small class="text-muted">Missing Parsers</small>
+            </div>
+          </div>
+          <div class="col-md-3">
+            <div class="border rounded p-2">
+              <h4 class="mb-1">${stats.total_papers_with_publisher}</h4>
+              <small class="text-muted">Papers with Publisher</small>
+            </div>
+          </div>
+        </div>
+ `; + } + + // Update publishers table + const publishersTableBody = document.getElementById("publishersTableBody"); + if ( + publishersTableBody && + this.publishersData && + this.publishersData.publishers + ) { + publishersTableBody.innerHTML = ""; + + if (this.publishersData.publishers.length === 0) { + publishersTableBody.innerHTML = ` + + + No publishers detected yet.
+ Run the publisher_detector scraper to identify publishers from paper URLs. + + + `; + return; + } + + this.publishersData.publishers.forEach((publisher) => { + const row = document.createElement("tr"); + + // Publisher status badge + const statusBadge = publisher.has_parser + ? ' Available' + : ' Missing'; + + // Parser availability indicator + const parserIndicator = publisher.has_parser + ? '' + : ''; + + row.innerHTML = ` + + ${publisher.name} + + + ${publisher.paper_count} + + ${statusBadge} + ${parserIndicator} + `; + + publishersTableBody.appendChild(row); + }); + } + } + + // Public method to show the modal + show() { + if (this.modal) { + const bootstrapModal = new bootstrap.Modal(this.modal); + bootstrapModal.show(); + } + } +} + +// Global function to load scraper overview (used by retry button) +function loadScraperOverview() { + if (window.scraperOverview) { + window.scraperOverview.loadScraperOverview(); + } +} + +// Global function to show scraper overview modal +function showScraperOverview() { + if (!window.scraperOverview) { + window.scraperOverview = new ScraperOverview(); + } + window.scraperOverview.show(); +} + +// Initialize when DOM is ready +document.addEventListener("DOMContentLoaded", function () { + window.scraperOverview = new ScraperOverview(); +}); diff --git a/scipaperloader/templates/config/general.html.jinja b/scipaperloader/templates/config/general.html.jinja index 694ae28..6300e03 100644 --- a/scipaperloader/templates/config/general.html.jinja +++ b/scipaperloader/templates/config/general.html.jinja @@ -65,7 +65,13 @@
-
Scraper Module
+
+
Scraper Module
+ +

Select which scraper module to use for processing papers.

diff --git a/scipaperloader/templates/config/index.html.jinja b/scipaperloader/templates/config/index.html.jinja index 5feb4dd..727147c 100644 --- a/scipaperloader/templates/config/index.html.jinja +++ b/scipaperloader/templates/config/index.html.jinja @@ -53,4 +53,13 @@ {% endif %}
-{% endblock content %} \ No newline at end of file + + +{% include "partials/scraper_overview_modal.html.jinja" %} + +{% endblock content %} + +{% block scripts %} +{{ super() }} + +{% endblock scripts %} \ No newline at end of file diff --git a/scipaperloader/templates/partials/scraper_overview_modal.html.jinja b/scipaperloader/templates/partials/scraper_overview_modal.html.jinja new file mode 100644 index 0000000..eeaae7d --- /dev/null +++ b/scipaperloader/templates/partials/scraper_overview_modal.html.jinja @@ -0,0 +1,249 @@ + + + + \ No newline at end of file diff --git a/scipaperloader/templates/scraper.html.jinja b/scipaperloader/templates/scraper.html.jinja index 2c0f2dc..d67fa5e 100644 --- a/scipaperloader/templates/scraper.html.jinja +++ b/scipaperloader/templates/scraper.html.jinja @@ -114,20 +114,44 @@
-
-
Volume Configuration
+
+
Scraper Configuration
+
-
+
- +
Enter a value between 1 and {{ max_volume }}
-
Enter a value between 1 and {{ max_volume }}
+ +
+ + +
+ Select which scraper module to use for automated processing. Current: {{ + current_scraper_module }} +
+
+ +
@@ -306,6 +330,10 @@
+ + +{% include "partials/scraper_overview_modal.html.jinja" %} + {% endblock content %} {% block scripts %} @@ -320,6 +348,7 @@ +