adds scraper modules and modular publisher parser system

parent ce6bc03b46
commit a7964a2f3d

22 Makefile
@@ -1,5 +1,5 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics clean-papers purge-db

# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
@@ -14,7 +14,7 @@ clean:
	rm -rf venv build dist .pytest_cache .mypy_cache *.egg-info

# Define database path
DB_PATH=scipaperloader/papers.db
DB_PATH=instance/papers.db

# Backup the database with timestamp
backup-db:
@@ -90,6 +90,24 @@ reset-db: venv
	$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
	$(PYTHON) -m flask --app scipaperloader db upgrade

# Clean all papers from the database (keep other tables intact)
clean-papers: venv
	@echo "Cleaning all papers from the database..."
	@$(PYTHON) -c "from scipaperloader.db import db; from scipaperloader.models import PaperMetadata; from scipaperloader import create_app; app = create_app(); app.app_context().push(); PaperMetadata.query.delete(); db.session.commit(); print('All papers have been removed from the database')"

# Completely purge all database contents (removes all tables and data)
purge-db: venv
	@echo "WARNING: This will completely wipe all database contents!"
	@read -p "Are you sure you want to continue? (y/N) " -n 1 -r; \
	echo; \
	if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
		echo "Purging database..."; \
		rm -f $(DB_PATH); \
		echo "Database completely purged"; \
	else \
		echo "Operation cancelled"; \
	fi

# Create and set up virtual environment
venv:
	python3 -m venv venv && \
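The new clean-papers target packs its whole database routine into a single python -c string. As a hedged illustration (not part of the commit), the same logic unrolled into a standalone script reads like this; the imports and calls are taken verbatim from the target:

# Illustrative sketch only: the clean-papers one-liner above, unrolled.
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata

app = create_app()
with app.app_context():
    deleted = PaperMetadata.query.delete()  # remove all paper rows, keep other tables
    db.session.commit()
    print(f"All papers have been removed from the database ({deleted} rows)")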
@@ -15,6 +15,8 @@ dependencies = [
    "pandas>=2.2.3,<3",
    "APScheduler>=3.10.4,<4",
    "flask-migrate>=4.1.0,<5",
    "beautifulsoup4>=4.13.4,<5",
    "requests>=2.32.4,<3"
]

[project.optional-dependencies]
@@ -29,6 +29,10 @@ def index():
    # Get volume configuration
    volume_config = VolumeConfig.get_current_volume()

    # Get scraper module configuration
    from ..models import ScraperModuleConfig
    current_scraper_module = ScraperModuleConfig.get_current_module()

    # Get paper counts by status
    paper_counts = {
        'new': PaperMetadata.query.filter_by(status='New').count(),
@@ -46,7 +50,10 @@ def index():
        recent_logs=recent_logs,
        paper_counts=paper_counts,
        volume_config=volume_config,
        max_volume=MAX_VOLUME
        max_volume=MAX_VOLUME,
        current_scraper_module=current_scraper_module,
        available_scraper_modules=[s["name"] for s in available_scrapers],
        scraper_details={s["name"]: s for s in available_scrapers}
    )

@bp.route("/start", methods=["POST"])
@@ -219,6 +226,13 @@ def get_status():
        # Get current hour quota info
        current_quota = scraper_manager.get_current_hour_quota()

        # Get current scraper module configuration
        from ..models import ScraperModuleConfig
        current_scraper_module = ScraperModuleConfig.get_current_module()

        # Get volume configuration
        current_volume = VolumeConfig.get_current_volume()

        return jsonify({
            "success": True,
            "scraper_state": {
@@ -227,7 +241,9 @@ def get_status():
                "last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
            },
            "paper_counts": paper_counts,
            "current_quota": current_quota
            "current_quota": current_quota,
            "current_scraper_module": current_scraper_module,
            "volume_config": current_volume
        })

    except Exception as e:
@@ -665,6 +681,35 @@ def update_scraper_config():
                "message": message
            }), 400

        # Handle scraper module configuration updates
        if "scraper_module" in data:
            from ..models import ScraperModuleConfig

            new_module = data["scraper_module"]

            # Validate that the module exists and is valid
            available_modules = [m["name"] for m in get_available_scrapers()]

            if new_module not in available_modules:
                return jsonify({
                    "success": False,
                    "message": f"Invalid scraper module: {new_module}"
                }), 400

            # Update the database configuration
            ScraperModuleConfig.set_module(new_module)

            ActivityLog.log_scraper_command(
                action="update_scraper_module",
                status="success",
                description=f"Updated scraper module to '{new_module}'"
            )

            return jsonify({
                "success": True,
                "message": f"Scraper module updated to '{new_module}' successfully"
            })

        # Handle other configuration updates here if needed in the future

        return jsonify({
@@ -681,4 +726,73 @@ def update_scraper_config():
        return jsonify({
            "success": False,
            "message": f"Error updating scraper config: {str(e)}"
        }), 500

@bp.route("/publishers")
def get_publishers():
    """Get publisher overview data for the scraper overview modal."""
    try:
        import os
        import glob

        # Get available parser modules
        parsers_dir = os.path.join(current_app.root_path, 'parsers')
        parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py'))
        available_parsers = []

        for parser_file in parser_files:
            filename = os.path.basename(parser_file)
            if filename != 'base_parser.py':  # Skip the base parser
                parser_name = filename.replace('_parser.py', '')
                available_parsers.append(parser_name)

        # Get publishers from database (papers that have publisher detected)
        publisher_query = db.session.query(
            PaperMetadata.publisher,
            db.func.count(PaperMetadata.id).label('paper_count')
        ).filter(
            PaperMetadata.publisher.isnot(None),
            PaperMetadata.publisher != ''
        ).group_by(PaperMetadata.publisher).all()

        publishers_data = []
        for publisher, count in publisher_query:
            # Check if a parser exists for this publisher
            has_parser = publisher in available_parsers

            publishers_data.append({
                'name': publisher,
                'paper_count': count,
                'has_parser': has_parser,
                'parser_status': 'available' if has_parser else 'missing'
            })

        # Sort by paper count descending
        publishers_data.sort(key=lambda x: x['paper_count'], reverse=True)

        # Get totals
        total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data)
        total_papers_without_publisher = PaperMetadata.query.filter(
            db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '')
        ).count()

        return jsonify({
            'success': True,
            'data': {
                'publishers': publishers_data,
                'available_parsers': available_parsers,
                'stats': {
                    'total_publishers': len(publishers_data),
                    'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]),
                    'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]),
                    'total_papers_with_publisher': total_papers_with_publisher,
                    'total_papers_without_publisher': total_papers_without_publisher
                }
            }
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'message': f'Error getting publisher data: {str(e)}'
        }), 500
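A quick hedged sketch of how a client might consume the new publisher overview endpoint. The blueprint's URL prefix is not visible in this diff, so the /scraper prefix and the local port below are assumptions:

# Illustrative sketch: querying the new publisher overview endpoint.
import requests

resp = requests.get("http://localhost:5000/scraper/publishers", timeout=10)  # prefix assumed
payload = resp.json()
if payload.get("success"):
    for pub in payload["data"]["publishers"]:
        print(pub["name"], pub["paper_count"], pub["parser_status"])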
@@ -191,6 +191,7 @@ class PaperMetadata(db.Model):
    type = db.Column(db.String(50))
    language = db.Column(db.String(50))
    published_online = db.Column(db.Date)  # or DateTime/String
    publisher = db.Column(db.String(100), nullable=True)  # Detected publisher name
    status = db.Column(db.String(10))  # 'Pending','Done','Failed'
    previous_status = db.Column(db.String(10), nullable=True)  # Store previous status for reversion
    file_path = db.Column(db.Text)
6 scipaperloader/parsers/__init__.py Normal file
@@ -0,0 +1,6 @@
# Parser modules for extracting full text from publisher-specific HTML content
from .base_parser import BaseParser, ParsedContent, ParseError
from .elsevier_parser import ElsevierParser
from .arxiv_parser import ArxivParser

__all__ = ['BaseParser', 'ParsedContent', 'ParseError', 'ElsevierParser', 'ArxivParser']
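The package exports the two concrete parsers together with the shared base types. A minimal sketch of the intended selection pattern, mirroring the loop in scrapers/text_extractor.py further down (the helper function name here is illustrative only):

# Illustrative sketch: try the exported parsers in order, use the first match.
from scipaperloader.parsers import ArxivParser, ElsevierParser, ParseError

def extract_text(html: str, doi: str):
    for parser in (ElsevierParser(), ArxivParser()):
        if parser.can_parse(html):
            try:
                return parser.parse(html, doi)
            except ParseError:
                return None
    return None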
227 scipaperloader/parsers/arxiv_parser.py Normal file
@@ -0,0 +1,227 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List
|
||||
from .base_parser import BaseParser, ParsedContent, ParseError
|
||||
|
||||
class ArxivParser(BaseParser):
|
||||
"""Parser for arXiv papers."""
|
||||
|
||||
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
|
||||
"""Check if this is an arXiv page."""
|
||||
html_lower = html_content.lower()
|
||||
|
||||
# Check for arXiv indicators
|
||||
indicators = [
|
||||
'arxiv.org',
|
||||
'export.arxiv.org',
|
||||
'arxiv:',
|
||||
'meta name="citation_publisher" content="arxiv"',
|
||||
]
|
||||
|
||||
return any(indicator in html_lower for indicator in indicators)
|
||||
|
||||
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
|
||||
"""Parse arXiv HTML content."""
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
title = self._extract_title(soup)
|
||||
|
||||
# Extract abstract
|
||||
abstract = self._extract_abstract(soup)
|
||||
|
||||
# Extract authors
|
||||
authors = self._extract_authors(soup)
|
||||
|
||||
# Extract full text (arXiv usually just has abstract on the HTML page)
|
||||
full_text = self._extract_full_text(soup, abstract)
|
||||
|
||||
# Extract keywords/subjects
|
||||
keywords = self._extract_subjects(soup)
|
||||
|
||||
# Extract arxiv ID
|
||||
arxiv_id = self._extract_arxiv_id(soup)
|
||||
|
||||
if not full_text or len(full_text.strip()) < 50:
|
||||
raise ParseError("Could not extract meaningful content from arXiv page")
|
||||
|
||||
return ParsedContent(
|
||||
full_text=full_text,
|
||||
title=title,
|
||||
abstract=abstract,
|
||||
authors=authors,
|
||||
keywords=keywords,
|
||||
sections=None, # arXiv HTML pages don't usually have full sections
|
||||
references=None, # References are typically in the PDF
|
||||
doi=doi,
|
||||
journal="arXiv",
|
||||
publication_date=self._extract_submission_date(soup),
|
||||
metadata={
|
||||
'parser': 'arxiv',
|
||||
'arxiv_id': arxiv_id,
|
||||
'source': 'arxiv.org'
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ParseError(f"Failed to parse arXiv content: {str(e)}")
|
||||
|
||||
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract paper title."""
|
||||
# Try multiple title selectors for arXiv
|
||||
selectors = [
|
||||
'h1.title',
|
||||
'meta[name="citation_title"]',
|
||||
'title'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_title'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True)
|
||||
# Remove "Title:" prefix if present
|
||||
text = re.sub(r'^Title:\s*', '', text)
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract paper abstract."""
|
||||
# arXiv abstract selectors
|
||||
selectors = [
|
||||
'blockquote.abstract',
|
||||
'div.abstract',
|
||||
'meta[name="citation_abstract"]'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_abstract'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True)
|
||||
# Remove "Abstract:" prefix if present
|
||||
text = re.sub(r'^Abstract:\s*', '', text)
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract author names."""
|
||||
authors = []
|
||||
|
||||
# Try author meta tags
|
||||
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
|
||||
if author_metas:
|
||||
authors = [meta.get('content', '').strip() for meta in author_metas]
|
||||
|
||||
# Try arXiv author div
|
||||
if not authors:
|
||||
authors_div = soup.select_one('div.authors')
|
||||
if authors_div:
|
||||
# Extract author links or text
|
||||
author_links = authors_div.find_all('a')
|
||||
if author_links:
|
||||
authors = [link.get_text(strip=True) for link in author_links]
|
||||
else:
|
||||
# Fallback to text parsing
|
||||
text = authors_div.get_text()
|
||||
# Remove "Authors:" prefix and split by commas
|
||||
text = re.sub(r'^Authors?:\s*', '', text)
|
||||
authors = [author.strip() for author in text.split(',')]
|
||||
|
||||
return authors if authors else None
|
||||
|
||||
def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
|
||||
"""Extract main content (usually just abstract for arXiv HTML pages)."""
|
||||
content_parts = []
|
||||
|
||||
# For arXiv, the HTML page typically only contains abstract and metadata
|
||||
# The full text is in the PDF
|
||||
|
||||
if abstract:
|
||||
content_parts.append(f"Abstract\n{abstract}")
|
||||
|
||||
# Look for any additional content sections
|
||||
comments_section = soup.select_one('td.comments')
|
||||
if comments_section:
|
||||
comments = comments_section.get_text(strip=True)
|
||||
if comments:
|
||||
content_parts.append(f"Comments\n{comments}")
|
||||
|
||||
# Add note about PDF availability
|
||||
content_parts.append(
|
||||
"\nNote: This is the abstract and metadata from the arXiv HTML page. "
|
||||
"The full text is available in the PDF version."
|
||||
)
|
||||
|
||||
return '\n\n'.join(content_parts)
|
||||
|
||||
def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract subject classifications."""
|
||||
subjects = []
|
||||
|
||||
# Look for subject classification
|
||||
subjects_td = soup.select_one('td.subjects')
|
||||
if subjects_td:
|
||||
subjects_text = subjects_td.get_text(strip=True)
|
||||
# Parse subjects (format: "Primary: subject1; Secondary: subject2")
|
||||
subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
|
||||
# Clean up prefixes
|
||||
subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
|
||||
subjects = [subj for subj in subjects if subj] # Remove empty strings
|
||||
|
||||
return subjects if subjects else None
|
||||
|
||||
def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract arXiv ID."""
|
||||
# Look for arXiv ID in various places
|
||||
arxiv_id_patterns = [
|
||||
r'arXiv:(\d+\.\d+(?:v\d+)?)',
|
||||
r'(\d{4}\.\d{4,5}(?:v\d+)?)',
|
||||
]
|
||||
|
||||
# Search in page text
|
||||
page_text = soup.get_text()
|
||||
for pattern in arxiv_id_patterns:
|
||||
match = re.search(pattern, page_text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
# Search in URL or meta tags
|
||||
canonical_link = soup.find('link', attrs={'rel': 'canonical'})
|
||||
if canonical_link:
|
||||
href = canonical_link.get('href', '')
|
||||
for pattern in arxiv_id_patterns:
|
||||
match = re.search(pattern, href)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract submission date."""
|
||||
# Look for submission date
|
||||
submission_td = soup.select_one('td.submission-history')
|
||||
if submission_td:
|
||||
date_text = submission_td.get_text()
|
||||
# Extract date (format varies)
|
||||
date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
|
||||
if date_match:
|
||||
return date_match.group(1)
|
||||
|
||||
# Try meta tag
|
||||
date_meta = soup.find('meta', attrs={'name': 'citation_date'})
|
||||
if date_meta:
|
||||
return date_meta.get('content', '').strip()
|
||||
|
||||
return None
|
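For reference, the two arXiv ID patterns used above behave like this on a made-up sentence:

# Illustrative sketch: what the arXiv ID patterns match.
import re

patterns = [
    r'arXiv:(\d+\.\d+(?:v\d+)?)',
    r'(\d{4}\.\d{4,5}(?:v\d+)?)',
]
sample = "We release code for arXiv:2101.00123v2 alongside the paper."  # made-up ID
for pattern in patterns:
    match = re.search(pattern, sample)
    if match:
        print(match.group(1))  # -> 2101.00123v2
        break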
83 scipaperloader/parsers/base_parser.py Normal file
@@ -0,0 +1,83 @@
from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass

@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML."""
    full_text: str
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    sections: Optional[Dict[str, str]] = None  # section_title -> section_content
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    publication_date: Optional[str] = None
    metadata: Optional[Dict] = None  # Additional metadata specific to publisher

class BaseParser(ABC):
    """Base class for all publisher-specific parsers."""

    def __init__(self):
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """
        pass

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """
        pass

    def get_name(self) -> str:
        """Return the name of this parser."""
        return self.parser_name

    def get_description(self) -> str:
        """Return a description of this parser."""
        # __doc__ always exists on a class (it is None when there is no
        # docstring), so a getattr default never kicks in; use `or` instead.
        return self.__class__.__doc__ or "No description available"

    def validate_content(self, content: ParsedContent) -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if content is valid, False otherwise
        """
        # Basic validation - must have some full text
        if not content.full_text or len(content.full_text.strip()) < 100:
            return False

        return True

class ParseError(Exception):
    """Exception raised when parsing fails."""
    pass
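BaseParser leaves only can_parse() and parse() abstract. A minimal hypothetical subclass (not part of this commit) showing the contract:

# Illustrative sketch of the BaseParser contract; this fallback parser is hypothetical.
from typing import Optional
from bs4 import BeautifulSoup
from scipaperloader.parsers.base_parser import BaseParser, ParsedContent, ParseError

class PlainHtmlParser(BaseParser):
    """Fallback parser that keeps the visible text of any HTML page."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        return "<html" in html_content.lower()

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)
        if not text:
            raise ParseError("No visible text found")
        return ParsedContent(full_text=text, doi=doi, metadata={'parser': self.get_name()})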
252 scipaperloader/parsers/elsevier_parser.py Normal file
@@ -0,0 +1,252 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List
|
||||
from .base_parser import BaseParser, ParsedContent, ParseError
|
||||
|
||||
class ElsevierParser(BaseParser):
|
||||
"""Parser for Elsevier/ScienceDirect articles."""
|
||||
|
||||
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
|
||||
"""Check if this is an Elsevier/ScienceDirect page."""
|
||||
html_lower = html_content.lower()
|
||||
|
||||
# Check for Elsevier/ScienceDirect indicators
|
||||
indicators = [
|
||||
'sciencedirect.com',
|
||||
'elsevier.com',
|
||||
'meta name="citation_publisher" content="elsevier"',
|
||||
'copyright.*elsevier',
|
||||
'sciencedirect',
|
||||
]
|
||||
|
||||
return any(indicator in html_lower for indicator in indicators)
|
||||
|
||||
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
|
||||
"""Parse Elsevier/ScienceDirect HTML content."""
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
title = self._extract_title(soup)
|
||||
|
||||
# Extract abstract
|
||||
abstract = self._extract_abstract(soup)
|
||||
|
||||
# Extract authors
|
||||
authors = self._extract_authors(soup)
|
||||
|
||||
# Extract full text
|
||||
full_text = self._extract_full_text(soup)
|
||||
|
||||
# Extract sections
|
||||
sections = self._extract_sections(soup)
|
||||
|
||||
# Extract keywords
|
||||
keywords = self._extract_keywords(soup)
|
||||
|
||||
# Extract references
|
||||
references = self._extract_references(soup)
|
||||
|
||||
# Extract journal info
|
||||
journal = self._extract_journal(soup)
|
||||
|
||||
# Extract publication date
|
||||
publication_date = self._extract_publication_date(soup)
|
||||
|
||||
# Combine everything into full text if sections exist
|
||||
if sections:
|
||||
full_text = self._combine_sections(sections, abstract)
|
||||
|
||||
if not full_text or len(full_text.strip()) < 100:
|
||||
raise ParseError("Could not extract meaningful full text content")
|
||||
|
||||
return ParsedContent(
|
||||
full_text=full_text,
|
||||
title=title,
|
||||
abstract=abstract,
|
||||
authors=authors,
|
||||
keywords=keywords,
|
||||
sections=sections,
|
||||
references=references,
|
||||
doi=doi,
|
||||
journal=journal,
|
||||
publication_date=publication_date,
|
||||
metadata={
|
||||
'parser': 'elsevier',
|
||||
'source': 'sciencedirect'
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ParseError(f"Failed to parse Elsevier content: {str(e)}")
|
||||
|
||||
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract article title."""
|
||||
# Try multiple title selectors
|
||||
selectors = [
|
||||
'h1.title-text',
|
||||
'h1[data-testid="title"]',
|
||||
'h1.article-title',
|
||||
'meta[name="citation_title"]',
|
||||
'title'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_title'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract article abstract."""
|
||||
selectors = [
|
||||
'div.abstract-content',
|
||||
'div[data-testid="abstract"]',
|
||||
'div.abstract',
|
||||
'section.abstract',
|
||||
'div#abstract'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract author names."""
|
||||
authors = []
|
||||
|
||||
# Try author meta tags
|
||||
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
|
||||
if author_metas:
|
||||
authors = [meta.get('content', '').strip() for meta in author_metas]
|
||||
|
||||
# Try author div/span elements
|
||||
if not authors:
|
||||
author_elements = soup.select('div.author a, span.author, .author-name')
|
||||
authors = [elem.get_text(strip=True) for elem in author_elements]
|
||||
|
||||
return authors if authors else None
|
||||
|
||||
def _extract_full_text(self, soup: BeautifulSoup) -> str:
|
||||
"""Extract main article content."""
|
||||
content_parts = []
|
||||
|
||||
# Try main content selectors
|
||||
main_selectors = [
|
||||
'div.article-content',
|
||||
'div.body-content',
|
||||
'main.article-body',
|
||||
'div[data-testid="article-body"]',
|
||||
'section.article-section'
|
||||
]
|
||||
|
||||
for selector in main_selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Remove script, style, and navigation elements
|
||||
for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
|
||||
unwanted.decompose()
|
||||
|
||||
text = element.get_text(separator='\n', strip=True)
|
||||
if text and len(text) > 50: # Only add substantial content
|
||||
content_parts.append(text)
|
||||
|
||||
return '\n\n'.join(content_parts)
|
||||
|
||||
def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
|
||||
"""Extract article sections with headings."""
|
||||
sections = {}
|
||||
|
||||
# Look for section headings and content
|
||||
section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))
|
||||
|
||||
for heading in section_elements:
|
||||
section_title = heading.get_text(strip=True)
|
||||
|
||||
# Find content after this heading until next heading
|
||||
content_parts = []
|
||||
current = heading.next_sibling
|
||||
|
||||
while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
|
||||
if hasattr(current, 'get_text'):
|
||||
text = current.get_text(strip=True)
|
||||
if text:
|
||||
content_parts.append(text)
|
||||
current = current.next_sibling
|
||||
|
||||
if content_parts:
|
||||
sections[section_title] = '\n'.join(content_parts)
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract article keywords."""
|
||||
keywords = []
|
||||
|
||||
# Try keyword meta tags
|
||||
keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
|
||||
if keyword_metas:
|
||||
for meta in keyword_metas:
|
||||
content = meta.get('content', '')
|
||||
if content:
|
||||
keywords.extend([kw.strip() for kw in content.split(',')])
|
||||
|
||||
# Try keyword sections
|
||||
if not keywords:
|
||||
keyword_sections = soup.select('div.keywords, section.keywords')
|
||||
for section in keyword_sections:
|
||||
text = section.get_text()
|
||||
keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])
|
||||
|
||||
return keywords if keywords else None
|
||||
|
||||
def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract references."""
|
||||
references = []
|
||||
|
||||
ref_sections = soup.select('section.references, div.references, ol.references li')
|
||||
for section in ref_sections:
|
||||
if section.name == 'li':
|
||||
references.append(section.get_text(strip=True))
|
||||
else:
|
||||
ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
|
||||
references.extend([item.get_text(strip=True) for item in ref_items])
|
||||
|
||||
return references if references else None
|
||||
|
||||
def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract journal name."""
|
||||
journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
|
||||
if journal_meta:
|
||||
return journal_meta.get('content', '').strip()
|
||||
|
||||
return None
|
||||
|
||||
def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract publication date."""
|
||||
date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
|
||||
if date_meta:
|
||||
return date_meta.get('content', '').strip()
|
||||
|
||||
return None
|
||||
|
||||
def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
|
||||
"""Combine all sections into full text."""
|
||||
full_text_parts = []
|
||||
|
||||
if abstract:
|
||||
full_text_parts.append(f"Abstract\n{abstract}")
|
||||
|
||||
for section_title, section_content in sections.items():
|
||||
full_text_parts.append(f"{section_title}\n{section_content}")
|
||||
|
||||
return '\n\n'.join(full_text_parts)
|
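_combine_sections() simply stitches the abstract and the section dictionary back into one text block. A tiny illustration of the output shape, with made-up data:

# Illustrative sketch: the text shape produced from a sections dict plus abstract.
sections = {
    "1. Introduction": "Motivation and prior work ...",
    "2. Methods": "Data collection and analysis ...",
}
abstract = "We study ..."

parts = [f"Abstract\n{abstract}"]
parts += [f"{title}\n{body}" for title, body in sections.items()]
full_text = '\n\n'.join(parts)
print(full_text)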
@ -18,6 +18,43 @@ class BaseScraper(ABC):
|
||||
OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping
|
||||
OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the scraper."""
|
||||
self.scraper_name = self.get_name().lower()
|
||||
|
||||
def log_scrape_start(self, doi: str, paper_id: Optional[int] = None):
|
||||
"""Log the start of a scraping operation."""
|
||||
from ..models import ActivityLog
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action=f"{self.scraper_name}_scrape_start",
|
||||
status="info",
|
||||
description=f"Starting {self.get_name()} for DOI: {doi}",
|
||||
paper_id=paper_id
|
||||
)
|
||||
|
||||
def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None):
|
||||
"""Log successful completion of scraping."""
|
||||
from ..models import ActivityLog
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action=f"{self.scraper_name}_scrape_success",
|
||||
status="success",
|
||||
description=f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
|
||||
paper_id=paper_id
|
||||
)
|
||||
|
||||
def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None):
|
||||
"""Log failed scraping operation."""
|
||||
from ..models import ActivityLog
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action=f"{self.scraper_name}_scrape_failure",
|
||||
status="error",
|
||||
description=f"{self.get_name()} failed for DOI: {doi} - {message}",
|
||||
paper_id=paper_id
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""
|
||||
|
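The new log_scrape_start/log_scrape_success/log_scrape_failure helpers centralize the ActivityLog calls that the individual scrapers previously made themselves. A hedged sketch of how a scraper subclass is expected to use them (the dummy scraper below follows this pattern; the actual fetch step is elided):

# Illustrative sketch: a scraper subclass using the new logging helpers.
import time
from datetime import datetime
from scipaperloader.scrapers.base import BaseScraper, ScrapeResult

class Scraper(BaseScraper):
    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        self.log_scrape_start(doi)
        try:
            # ... fetch and store something for `doi` ...
            self.log_scrape_success(doi, "done")
            status = "success"
        except Exception as exc:
            self.log_scrape_failure(doi, str(exc))
            status = "error"
        return ScrapeResult(
            status=status,
            message="",
            data=None,
            duration=time.time() - start,
            timestamp=datetime.utcnow(),
        )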
@ -30,6 +30,9 @@ class Scraper(BaseScraper):
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Simulate processing time (1-3 seconds)
|
||||
processing_time = random.uniform(1, 3)
|
||||
time.sleep(processing_time)
|
||||
@ -145,12 +148,7 @@ class Scraper(BaseScraper):
|
||||
)
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="dummy_scrape",
|
||||
status="success",
|
||||
description=f"Successfully scraped {doi}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
@ -178,12 +176,7 @@ class Scraper(BaseScraper):
|
||||
paper.error_msg = error_msg
|
||||
|
||||
# Log failure
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="dummy_scrape",
|
||||
status="error",
|
||||
description=f"Failed to scrape {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
|
@ -30,13 +30,8 @@ class Scraper(BaseScraper):
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log retry attempt
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_failed_paper",
|
||||
status="info",
|
||||
description=f"Retrying failed paper: {paper.title}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
# Log start of retry
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Simulate longer processing time for retry (2-5 seconds)
|
||||
processing_time = random.uniform(2, 5)
|
||||
@ -64,12 +59,7 @@ class Scraper(BaseScraper):
|
||||
result_data = {"file_path": file_path}
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_success",
|
||||
status="success",
|
||||
description=f"Successfully retried {doi} on second attempt",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_success(doi, f"Successfully retried {doi} on second attempt", paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
@ -81,12 +71,7 @@ class Scraper(BaseScraper):
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save retry file: {str(e)}"
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_file_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
@ -105,12 +90,7 @@ class Scraper(BaseScraper):
|
||||
]
|
||||
error_msg = random.choice(error_messages)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_failure",
|
||||
status="error",
|
||||
description=f"Retry failed for {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
|
172 scipaperloader/scrapers/html_fetcher.py Normal file
@@ -0,0 +1,172 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Scraper that fetches HTML content from DOI and saves it for further processing."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "HtmlDownloaded"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "FetchingHtml"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Fetch HTML content from DOI and save to download path."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Prepare file paths
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}.html"
|
||||
file_path = os.path.join(download_path, file_name)
|
||||
|
||||
# Check/create download directory (same pattern as dummy)
|
||||
if not os.path.exists(download_path):
|
||||
try:
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
except OSError as e:
|
||||
error_msg = f"Failed to create download directory: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check path permissions (same pattern as dummy)
|
||||
if not os.access(download_path, os.W_OK):
|
||||
error_msg = f"Download path '{download_path}' is not writable"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="html_fetch_path_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_write_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Fetch HTML from DOI
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {'User-Agent': 'SciPaperLoader/1.0'}
|
||||
response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True)
|
||||
|
||||
# Check for invalid DOI (404) or other HTTP errors
|
||||
if response.status_code == 404:
|
||||
error_msg = f"Invalid DOI: {doi} not found"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "invalid_doi"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
response.raise_for_status() # Raise for other HTTP errors
|
||||
|
||||
# Save HTML content
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response.text)
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.file_path = file_path
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
# Log success
|
||||
self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully fetched HTML for {doi}",
|
||||
data={
|
||||
"file_path": file_path,
|
||||
"url": response.url, # Final URL after redirects
|
||||
"title": paper.title
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="html_fetch",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "network_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save HTML file: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "file_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
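Running this scraper by hand needs an application context, since it reads and writes PaperMetadata. A sketch using the create_app factory referenced in the Makefile targets above (the DOI is hypothetical):

# Illustrative sketch: invoking the HTML fetcher manually inside an app context.
from scipaperloader import create_app
from scipaperloader.scrapers.html_fetcher import Scraper

app = create_app()
with app.app_context():
    result = Scraper().scrape("10.1000/example-doi")  # hypothetical DOI
    print(result.status, result.message)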
282 scipaperloader/scrapers/publisher_detector.py Normal file
@@ -0,0 +1,282 @@
|
||||
import time
|
||||
import requests
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
|
||||
|
||||
# Publisher detection patterns based on URL domains and paths
|
||||
PUBLISHER_URL_PATTERNS = {
|
||||
'elsevier': [
|
||||
r'sciencedirect\.com',
|
||||
r'elsevier\.com',
|
||||
r'.*\.elsevier\.com'
|
||||
],
|
||||
'springer': [
|
||||
r'link\.springer\.com',
|
||||
r'springer\.com',
|
||||
r'.*\.springer\.com'
|
||||
],
|
||||
'wiley': [
|
||||
r'onlinelibrary\.wiley\.com',
|
||||
r'wiley\.com',
|
||||
r'.*\.wiley\.com'
|
||||
],
|
||||
'ieee': [
|
||||
r'ieeexplore\.ieee\.org',
|
||||
r'ieee\.org',
|
||||
r'.*\.ieee\.org'
|
||||
],
|
||||
'plos': [
|
||||
r'journals\.plos\.org',
|
||||
r'plos\.org',
|
||||
r'.*\.plos\.org'
|
||||
],
|
||||
'nature': [
|
||||
r'nature\.com',
|
||||
r'.*\.nature\.com'
|
||||
],
|
||||
'sage': [
|
||||
r'journals\.sagepub\.com',
|
||||
r'sagepub\.com',
|
||||
r'.*\.sagepub\.com'
|
||||
],
|
||||
'taylor_francis': [
|
||||
r'tandfonline\.com',
|
||||
r'.*\.tandfonline\.com'
|
||||
],
|
||||
'acs': [
|
||||
r'pubs\.acs\.org',
|
||||
r'acs\.org',
|
||||
r'.*\.acs\.org'
|
||||
],
|
||||
'arxiv': [
|
||||
r'arxiv\.org',
|
||||
r'export\.arxiv\.org'
|
||||
],
|
||||
'pubmed': [
|
||||
r'pubmed\.ncbi\.nlm\.nih\.gov',
|
||||
r'ncbi\.nlm\.nih\.gov'
|
||||
],
|
||||
'oxford': [
|
||||
r'academic\.oup\.com',
|
||||
r'oup\.com',
|
||||
r'.*\.oup\.com'
|
||||
],
|
||||
'cambridge': [
|
||||
r'cambridge\.org',
|
||||
r'.*\.cambridge\.org'
|
||||
],
|
||||
'biorxiv': [
|
||||
r'biorxiv\.org',
|
||||
r'.*\.biorxiv\.org'
|
||||
],
|
||||
'researchgate': [
|
||||
r'researchgate\.net',
|
||||
r'.*\.researchgate\.net'
|
||||
]
|
||||
}
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Detect publisher from the final URL after DOI redirect."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
try:
|
||||
# Get the final URL by following the DOI redirect
|
||||
final_url = self._get_final_url(doi)
|
||||
|
||||
if not final_url:
|
||||
error_msg = f"Could not resolve DOI {doi} to a URL"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "doi_resolution_failed"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Detect publisher from URL
|
||||
detected_publisher = self._detect_publisher_from_url(final_url)
|
||||
|
||||
if detected_publisher:
|
||||
# Update paper with detected publisher
|
||||
paper.publisher = detected_publisher
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=success_msg,
|
||||
data={
|
||||
"publisher": detected_publisher,
|
||||
"final_url": final_url
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
else:
|
||||
error_msg = f"Could not detect publisher from URL: {final_url}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={
|
||||
"final_url": final_url,
|
||||
"error_code": "publisher_not_detected"
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "publisher_detection_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
def _get_final_url(self, doi: str) -> Optional[str]:
|
||||
"""
|
||||
Get the final URL after following DOI redirects.
|
||||
|
||||
Args:
|
||||
doi: The DOI to resolve
|
||||
|
||||
Returns:
|
||||
Final URL after redirects, or None if resolution fails
|
||||
"""
|
||||
try:
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {
|
||||
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
|
||||
# Make a HEAD request to get the final URL without downloading content
|
||||
response = requests.head(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=15,
|
||||
allow_redirects=True
|
||||
)
|
||||
|
||||
# If HEAD is not allowed, try GET but with minimal content
|
||||
if response.status_code == 405: # Method Not Allowed
|
||||
response = requests.get(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=15,
|
||||
allow_redirects=True,
|
||||
stream=True # Don't download the full content
|
||||
)
|
||||
response.close() # Close connection after getting headers
|
||||
|
||||
if response.status_code in [200, 302, 301]:
|
||||
return response.url
|
||||
else:
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
# Log error but don't raise - we'll handle this gracefully
|
||||
return None
|
||||
|
||||
def _detect_publisher_from_url(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
Detect publisher from URL using domain patterns.
|
||||
|
||||
Args:
|
||||
url: The URL to analyze
|
||||
|
||||
Returns:
|
||||
Publisher name if detected, None otherwise
|
||||
"""
|
||||
if not url:
|
||||
return None
|
||||
|
||||
# Parse the URL to get the domain
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc.lower()
|
||||
|
||||
# Remove 'www.' prefix if present
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
# Score each publisher based on URL pattern matches
|
||||
publisher_scores = {}
|
||||
|
||||
for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
|
||||
score = 0
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, domain, re.IGNORECASE):
|
||||
score += 10 # Strong match for domain patterns
|
||||
|
||||
# Also check the full URL for path-based patterns
|
||||
if re.search(pattern, url.lower(), re.IGNORECASE):
|
||||
score += 5
|
||||
|
||||
if score > 0:
|
||||
publisher_scores[publisher] = score
|
||||
|
||||
# Return the publisher with the highest score
|
||||
if publisher_scores:
|
||||
best_publisher = max(publisher_scores.keys(), key=lambda x: publisher_scores[x])
|
||||
|
||||
# Only return if we have a reasonable confidence (score > 5)
|
||||
if publisher_scores[best_publisher] > 5:
|
||||
return best_publisher
|
||||
|
||||
return None
|
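The detector scores each publisher by matching its domain patterns against the resolved URL and keeps the best score above the confidence threshold. The scoring idea, reduced to two of the pattern groups above and applied to a made-up URL:

# Illustrative sketch of the _detect_publisher_from_url() scoring idea.
import re
from urllib.parse import urlparse

patterns = {
    'elsevier': [r'sciencedirect\.com', r'elsevier\.com'],
    'arxiv': [r'arxiv\.org', r'export\.arxiv\.org'],
}
url = "https://www.sciencedirect.com/science/article/pii/S0000000000000000"  # made-up
domain = urlparse(url).netloc.lower().removeprefix('www.')

scores = {
    publisher: sum(10 for p in pats if re.search(p, domain, re.IGNORECASE))
    for publisher, pats in patterns.items()
}
best = max(scores, key=scores.get)
print(best if scores[best] > 5 else None)  # -> elsevier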
237 scipaperloader/scrapers/text_extractor.py Normal file
@@ -0,0 +1,237 @@
|
||||
import time
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
from ..parsers.base_parser import BaseParser, ParseError
|
||||
from ..parsers.elsevier_parser import ElsevierParser
|
||||
from ..parsers.arxiv_parser import ArxivParser
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Full text extraction scraper that uses publisher-specific parsers."""
|
||||
|
||||
# This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
|
||||
INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
|
||||
OUTPUT_STATUS_SUCCESS = "TextExtracted"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "ExtractingText"
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Registry of available parsers
|
||||
self.parsers = [
|
||||
ElsevierParser(),
|
||||
ArxivParser(),
|
||||
# Add more parsers here as you create them
|
||||
# SpringerParser(),
|
||||
# WileyParser(),
|
||||
# IEEEParser(),
|
||||
]
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Extract full text using appropriate publisher parser."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Check if HTML file exists
|
||||
if not paper.file_path or not os.path.exists(paper.file_path):
|
||||
error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "html_file_not_found"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Read HTML content
|
||||
with open(paper.file_path, 'r', encoding='utf-8') as f:
|
||||
html_content = f.read()
|
||||
|
||||
# Find appropriate parser
|
||||
parser = self._select_parser(html_content)
|
||||
|
||||
if not parser:
|
||||
error_msg = f"No suitable parser found for DOI {doi}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "no_parser_available"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Parse content
|
||||
parsed_content = parser.parse(html_content, doi)
|
||||
|
||||
# Validate parsed content
|
||||
if not parser.validate_content(parsed_content):
|
||||
error_msg = f"Parsed content validation failed for DOI {doi}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "content_validation_failed"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Save extracted text to file
|
||||
text_file_path = self._save_extracted_text(parsed_content, doi)
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.error_msg = None
|
||||
# You might want to add a text_file_path field to store the text file location
|
||||
# paper.text_file_path = text_file_path
|
||||
db.session.commit()
|
||||
|
||||
success_msg = f"Successfully extracted text using {parser.get_name()} parser"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully extracted full text for {doi}",
|
||||
data={
|
||||
"text_file_path": text_file_path,
|
||||
"parser_used": parser.get_name(),
|
||||
"title": parsed_content.title,
|
||||
"word_count": len(parsed_content.full_text.split()),
|
||||
"has_abstract": bool(parsed_content.abstract),
|
||||
"has_sections": bool(parsed_content.sections),
|
||||
"author_count": len(parsed_content.authors) if parsed_content.authors else 0,
|
||||
"keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
|
||||
"reference_count": len(parsed_content.references) if parsed_content.references else 0
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except ParseError as e:
|
||||
error_msg = f"Parser error for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "parser_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "extraction_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
def _select_parser(self, html_content: str) -> Optional[BaseParser]:
|
||||
"""
|
||||
Select the most appropriate parser for the HTML content.
|
||||
|
||||
Args:
|
||||
html_content: The HTML content to analyze
|
||||
|
||||
Returns:
|
||||
The best parser for this content, or None if no parser can handle it
|
||||
"""
|
||||
for parser in self.parsers:
|
||||
if parser.can_parse(html_content):
|
||||
return parser
|
||||
|
||||
return None
|
||||
|
||||
def _save_extracted_text(self, parsed_content, doi: str) -> str:
|
||||
"""
|
||||
Save extracted text to a file.
|
||||
|
||||
Args:
|
||||
parsed_content: The parsed content object
|
||||
doi: The DOI of the paper
|
||||
|
||||
Returns:
|
||||
Path to the saved text file
|
||||
"""
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
|
||||
text_file_path = os.path.join(download_path, text_file_name)
|
||||
|
||||
with open(text_file_path, 'w', encoding='utf-8') as f:
|
||||
# Write structured content
|
||||
f.write(f"DOI: {parsed_content.doi or doi}\n")
|
||||
f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
|
||||
f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
|
||||
f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")
|
||||
|
||||
if parsed_content.authors:
|
||||
f.write(f"Authors: {', '.join(parsed_content.authors)}\n")
|
||||
|
||||
if parsed_content.keywords:
|
||||
f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")
|
||||
|
||||
f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
|
||||
f.write("=" * 80 + "\n\n")
|
||||
|
||||
# Write full text
|
||||
f.write(parsed_content.full_text)
|
||||
|
||||
# Optionally write references at the end
|
||||
if parsed_content.references:
|
||||
f.write("\n\n" + "=" * 80 + "\n")
|
||||
f.write("REFERENCES\n")
|
||||
f.write("=" * 80 + "\n")
|
||||
for i, ref in enumerate(parsed_content.references, 1):
|
||||
f.write(f"{i}. {ref}\n")
|
||||
|
||||
return text_file_path
|
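Taken together, the new modules form a small status-driven pipeline. A summary sketch (not part of the commit) of the transitions each scraper declares via its INPUT_STATUSES and OUTPUT_STATUS_* constants:

# Illustrative summary of the declared status transitions.
PIPELINE = {
    "web_fetcher":        ("New", "WebContentDownloaded"),
    "html_fetcher":       ("New", "HtmlDownloaded"),
    "publisher_detector": ("New", "PublisherDetected"),
    "text_extractor":     (("WebContentDownloaded", "PublisherDetected"), "TextExtracted"),
}
for module, (consumes, produces) in PIPELINE.items():
    print(f"{module}: {consumes} -> {produces} (or 'Failed')")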
201 scipaperloader/scrapers/web_fetcher.py Normal file
@@ -0,0 +1,201 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Web fetcher scraper that downloads HTML content from DOI URLs."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "FetchingWebContent"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Fetch HTML content from DOI and save to download path."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Prepare file paths
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}.html"
|
||||
file_path = os.path.join(download_path, file_name)
|
||||
|
||||
# Check/create download directory
|
||||
if not os.path.exists(download_path):
|
||||
try:
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
except OSError as e:
|
||||
error_msg = f"Failed to create download directory: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check path permissions
|
||||
if not os.access(download_path, os.W_OK):
|
||||
error_msg = f"Download path '{download_path}' is not writable"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_write_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Fetch HTML from DOI
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {
|
||||
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
}
|
||||
|
||||
response = requests.get(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=30,
|
||||
allow_redirects=True,
|
||||
verify=True
|
||||
)
|
||||
|
||||
# Check for invalid DOI (404) or other HTTP errors
|
||||
if response.status_code == 404:
|
||||
error_msg = f"Invalid DOI: {doi} not found (404)"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "invalid_doi"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check for other HTTP errors
|
||||
response.raise_for_status()
|
||||
|
||||
# Save HTML content
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response.text)
|
||||
|
||||
# Extract final URL after redirects (for publisher detection)
|
||||
final_url = response.url
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.file_path = file_path
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
# Log success
|
||||
success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully fetched HTML for {doi}",
|
||||
data={
|
||||
"file_path": file_path,
|
||||
"final_url": final_url,
|
||||
"content_length": len(response.text),
|
||||
"content_type": response.headers.get('content-type', 'unknown'),
|
||||
"title": paper.title,
|
||||
"domain": urlparse(final_url).netloc if final_url else None
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.HTTPError as e:
|
||||
error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "http_error", "status_code": e.response.status_code},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Network error fetching {doi_url}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "network_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save HTML file: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "file_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
384
scipaperloader/static/js/README.md
Normal file
@ -0,0 +1,384 @@
|
||||
# JavaScript Modularization Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The JavaScript code in the SciPaperLoader application has been modularized into reusable components to improve maintainability, reduce code duplication, and enable easier testing and updates.
|
||||
|
||||
## Modularization Task Completed
|
||||
|
||||
### Problem Statement
|
||||
The original codebase had ~800+ lines of inline JavaScript scattered across multiple Jinja templates with several critical issues:
|
||||
- **Code Duplication**: Similar functionality replicated across templates
|
||||
- **Maintenance Difficulty**: Changes required editing multiple template files
|
||||
- **Linter Issues**: Jinja template syntax mixed with JavaScript caused linting errors
|
||||
- **Testing Challenges**: Inline code was difficult to unit test
|
||||
- **Poor Separation of Concerns**: Template logic mixed with application logic
|
||||
|
||||
### Solution Implemented
|
||||
Successfully transformed the codebase by:
|
||||
|
||||
1. **Extracted 10 Modular JavaScript Files** (~800+ lines of code moved from templates)
|
||||
2. **Eliminated Code Duplication** by creating reusable components
|
||||
3. **Fixed Linter Compatibility** by separating template syntax from JavaScript logic
|
||||
4. **Implemented Clean Variable Passing** using JSON script tags instead of direct Jinja embedding
|
||||
5. **Created Class-Based Architecture** with proper inheritance and composition patterns
|
||||
6. **Established Inter-Component Communication** through callback systems
|
||||
7. **Added Comprehensive Error Handling** and loading states throughout
|
||||
|
||||
### Key Achievements
|
||||
- ✅ **5 templates modularized**: `scraper.html.jinja`, `papers.html.jinja`, `upload.html.jinja`, `logger.html.jinja`, `config/schedule.html.jinja`
|
||||
- ✅ **10 JavaScript modules created**: Covering all functionality from utilities to dashboard coordination
|
||||
- ✅ **Zero functionality loss**: All existing features preserved during modularization
|
||||
- ✅ **Improved maintainability**: Changes now require editing single module files
|
||||
- ✅ **Enhanced testability**: Individual modules can be unit tested
|
||||
- ✅ **Clean variable handling**: Jinja variables passed as JSON configuration instead of inline embedding
|
||||
|
||||
### Before vs After Example
|
||||
**Before (inline in template)**:
|
||||
```html
|
||||
<script>
|
||||
var maxVolume = {{ max_volume }}; // Linter error
|
||||
$('#start-scraper').click(function() {
|
||||
// 50+ lines of mixed template/JS code
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**After (modular)**:
|
||||
```html
|
||||
<script type="application/json" id="config-data">
|
||||
{"maxVolume": {{ max_volume|tojson }}}
|
||||
</script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-control.js') }}"></script>
|
||||
<script>
|
||||
const config = JSON.parse(document.getElementById('config-data').textContent);
|
||||
new ScraperControl(config).init();
|
||||
</script>
|
||||
```
|
||||
|
||||
## Modular JavaScript Files
|
||||
|
||||
### 1. `/static/js/common.js`
|
||||
**Purpose**: Common utilities used across the application
|
||||
|
||||
**Key Functions**:
|
||||
- `showFlashMessage(message, type)` - Display flash messages to users
|
||||
- `createStatusBadge(status)` - Generate status badge HTML
|
||||
- `formatTimestamp(timestamp)` - Format timestamps for display
|
||||
- `truncateText(text, maxLength)` - Truncate text with ellipsis
|
||||
- `toggleButtonLoading(button, loading, loadingText)` - Handle button loading states
|
||||
- `apiRequest(url, options)` - Generic API request wrapper
|
||||
|
||||
**Used by**: All templates that need basic utilities
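
A small usage sketch that combines these utilities (the function signatures come from the list above; the button ID is a placeholder, and the assumption that `apiRequest` resolves with parsed JSON and rejects on failure is illustrative):

```javascript
// Hypothetical refresh button wired up with the common.js utilities above.
// Assumes common.js is loaded first so the functions are in scope.
document.getElementById("refresh-status-btn")?.addEventListener("click", async (event) => {
  const button = event.currentTarget;
  toggleButtonLoading(button, true, "Refreshing...");
  try {
    // apiRequest(url, options) is the generic wrapper listed above; it is
    // assumed here to resolve with parsed JSON and reject on failure.
    const status = await apiRequest("/scraper/status", { method: "GET" });
    console.log("Current scraper status:", status);
    showFlashMessage("Scraper status refreshed", "success");
  } catch (error) {
    showFlashMessage(truncateText(`Refresh failed: ${error.message}`, 120), "warning");
  } finally {
    toggleButtonLoading(button, false);
  }
});
```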
|
||||
|
||||
### 2. `/static/js/modal-handler.js`
|
||||
**Purpose**: Handle modal dialogs with dynamic content loading
|
||||
|
||||
**Key Features**:
|
||||
- AJAX content loading
|
||||
- Error handling
|
||||
- Automatic click handler setup
|
||||
- Bootstrap modal integration
|
||||
|
||||
**Used by**:
|
||||
- `papers.html.jinja` (paper details modal)
|
||||
- `logger.html.jinja` (log details modal)
|
||||
|
||||
### 3. `/static/js/form-handler.js`
|
||||
**Purpose**: Handle form submissions with progress tracking
|
||||
|
||||
**Key Features**:
|
||||
- Progress modal display
|
||||
- Task status polling
|
||||
- Error handling
|
||||
- Customizable callbacks
|
||||
|
||||
**Used by**:
|
||||
- `upload.html.jinja` (CSV upload form)
|
||||
|
||||
### 4. `/static/js/chart.js`
|
||||
**Purpose**: Handle Chart.js activity visualization
|
||||
|
||||
**Key Features**:
|
||||
- Chart initialization and rendering
|
||||
- Data loading from API
|
||||
- Error handling for missing Chart.js
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja` (activity charts)
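
The module's exact API is not documented here, but the pattern it wraps looks roughly like the sketch below: guard against a missing Chart.js global, load data, then render. The canvas ID, endpoint, and field names are placeholders, not the module's real names.

```javascript
// Illustrative sketch of the guard-and-render pattern, not the actual
// chart.js module API. Element ID, endpoint, and field names are placeholders.
async function renderActivityChart() {
  if (typeof Chart === "undefined") {
    console.warn("Chart.js is not available; skipping activity chart");
    return;
  }
  const canvas = document.getElementById("activityChart");
  if (!canvas) return;

  const response = await fetch("/scraper/stats"); // placeholder endpoint
  const hourly = await response.json();

  new Chart(canvas, {
    type: "bar",
    data: {
      labels: hourly.map((point) => point.hour),
      datasets: [{ label: "Papers scraped", data: hourly.map((point) => point.count) }],
    },
    options: { responsive: true },
  });
}
```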
|
||||
|
||||
### 5. `/static/js/scraper-control.js`
|
||||
**Purpose**: Handle scraper control operations (start/stop/pause/reset)
|
||||
|
||||
**Key Features**:
|
||||
- Status polling
|
||||
- Volume configuration
|
||||
- Callback system for refreshing other components
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 6. `/static/js/paper-processor.js`
|
||||
**Purpose**: Handle paper search and processing functionality
|
||||
|
||||
**Key Features**:
|
||||
- Paper search
|
||||
- Single paper processing
|
||||
- Status polling
|
||||
- Scraper selection
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 7. `/static/js/activity-monitor.js`
|
||||
**Purpose**: Handle activity log display and real-time notifications
|
||||
|
||||
**Key Features**:
|
||||
- Activity log loading
|
||||
- Real-time updates
|
||||
- Notification management
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
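
"Real-time updates" is assumed here to mean periodic polling; a minimal sketch under that assumption follows. The class name, endpoint, and list markup are placeholders rather than the module's actual interface.

```javascript
// Illustrative polling loop only; the real activity-monitor.js API may differ.
class ActivityLogPoller {
  constructor(listElementId, intervalMs = 5000) {
    this.listElement = document.getElementById(listElementId);
    this.intervalMs = intervalMs;
    this.timer = null;
  }

  start() {
    this.refresh();
    this.timer = setInterval(() => this.refresh(), this.intervalMs);
  }

  stop() {
    if (this.timer) clearInterval(this.timer);
  }

  async refresh() {
    try {
      // apiRequest, formatTimestamp and truncateText come from common.js.
      const logs = await apiRequest("/scraper/activity_logs", { method: "GET" }); // placeholder endpoint
      if (this.listElement && Array.isArray(logs)) {
        this.listElement.innerHTML = logs
          .map((log) => `<li>${formatTimestamp(log.timestamp)} - ${truncateText(log.message, 80)}</li>`)
          .join("");
      }
    } catch (error) {
      console.error("Failed to refresh activity log:", error);
    }
  }
}
```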
|
||||
|
||||
### 8. `/static/js/scraper-dashboard.js`
|
||||
**Purpose**: Coordinate all scraper dashboard components
|
||||
|
||||
**Key Features**:
|
||||
- Component initialization
|
||||
- Inter-component communication
|
||||
- Configuration management
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 9. `/static/js/config-handler.js`
|
||||
**Purpose**: Handle configuration forms and Alpine.js integration
|
||||
|
||||
**Key Features**:
|
||||
- Configuration API calls
|
||||
- Alpine.js data objects
|
||||
- Schedule management
|
||||
- Volume updates
|
||||
|
||||
**Used by**:
|
||||
- `config/schedule.html.jinja`
|
||||
|
||||
## Template Updates
|
||||
|
||||
### Templates Using Modular JavaScript
|
||||
|
||||
1. **scraper.html.jinja**
|
||||
- Uses all scraper-related modules
|
||||
- Passes Jinja variables as configuration parameters
|
||||
- Initializes dashboard with `initScraperDashboard(config)`
|
||||
|
||||
2. **papers.html.jinja**
|
||||
- Uses `modal-handler.js` for paper detail modals
|
||||
- Simplified from custom modal code to single line initialization
|
||||
|
||||
3. **upload.html.jinja**
|
||||
- Uses `form-handler.js` for upload progress tracking
|
||||
- Custom result display function
|
||||
- Automatic task status polling
|
||||
|
||||
4. **logger.html.jinja**
|
||||
- Uses `modal-handler.js` for log detail modals
|
||||
- Custom URL construction for log endpoints
|
||||
|
||||
5. **config/schedule.html.jinja**
|
||||
- Uses `config-handler.js` for Alpine.js integration
|
||||
- Modular schedule management functions
|
||||
|
||||
## Benefits of Modularization
|
||||
|
||||
### 1. **Reusability**
|
||||
- Modal functionality shared between papers and logger templates
|
||||
- Common utilities used across all templates
|
||||
- Form handling can be reused for other forms
|
||||
|
||||
### 2. **Maintainability**
|
||||
- Single place to update common functionality
|
||||
- Clear separation of concerns
|
||||
- Easier debugging and testing
|
||||
|
||||
### 3. **Parameter Passing**
|
||||
- Jinja variables passed as configuration objects
|
||||
- No more hardcoded values in JavaScript
|
||||
- Environment-specific settings easily configurable
|
||||
|
||||
### 4. **Extensibility**
|
||||
- Easy to add new functionality to existing modules
|
||||
- New templates can easily use existing modules
|
||||
- Plugin-like architecture for components
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Modal Usage
|
||||
```javascript
|
||||
const modal = new ModalHandler('modalId', 'contentElementId');
|
||||
modal.setupClickHandlers('.clickable-items');
|
||||
```
|
||||
|
||||
### Form with Progress Tracking
|
||||
```javascript
|
||||
const formHandler = new FormHandler('formId', {
|
||||
onSuccess: (result) => console.log('Success:', result),
|
||||
onError: (error) => console.log('Error:', error)
|
||||
});
|
||||
```
|
||||
|
||||
### Configuration Management
|
||||
```javascript
|
||||
// In Alpine.js template
|
||||
x-data="configHandler.createScheduleManager(initialData, volume)"
|
||||
```
|
||||
|
||||
## Migration Notes
|
||||
|
||||
### Old vs New Approach
|
||||
|
||||
**Before**: Inline JavaScript in each template
|
||||
```html
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
// Lots of inline JavaScript code
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**After**: Modular imports with configuration
|
||||
```html
|
||||
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
|
||||
<script>
|
||||
const modal = new ModalHandler('modalId', 'contentId');
|
||||
modal.setupClickHandlers('.links');
|
||||
</script>
|
||||
```
|
||||
|
||||
### Jinja Variable Handling
|
||||
|
||||
To properly separate Jinja template variables from JavaScript code and avoid linting issues, we use a clean JSON configuration approach:
|
||||
|
||||
**Before**: Variables embedded directly in JavaScript (causes linting issues)
|
||||
```javascript
|
||||
if (volume > {{ max_volume }}) {
|
||||
// Error handling - JSLint will complain about {{ }}
|
||||
}
|
||||
```
|
||||
|
||||
**After**: Clean separation using JSON script tags
|
||||
```html
|
||||
<!-- Jinja variables in JSON format -->
|
||||
<script type="application/json" id="config-data">
|
||||
{
|
||||
"maxVolume": {{ max_volume|tojson }},
|
||||
"currentVolume": {{ volume|tojson }},
|
||||
"apiUrl": {{ url_for('api.endpoint')|tojson }},
|
||||
"csrfToken": {{ csrf_token()|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<!-- Clean JavaScript that reads the configuration -->
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const config = JSON.parse(document.getElementById('config-data').textContent);
|
||||
const handler = new VolumeHandler(config);
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**Benefits of this approach**:
|
||||
- **Linter-friendly**: No template syntax in JavaScript files
|
||||
- **Type-safe**: JSON ensures proper data types
|
||||
- **Maintainable**: Clear separation of concerns
|
||||
- **Secure**: Automatic escaping with `|tojson` filter
|
||||
- **Debuggable**: Easy to inspect configuration in DevTools
|
||||
|
||||
**Real-world example from scraper.html.jinja**:
|
||||
```html
|
||||
<script type="application/json" id="scraper-config">
|
||||
{
|
||||
"statusUrl": {{ url_for('api.scraper_status')|tojson }},
|
||||
"startUrl": {{ url_for('api.start_scraper')|tojson }},
|
||||
"volume": {{ volume|tojson }},
|
||||
"scraperType": {{ scraper_type|tojson }},
|
||||
"csrfToken": {{ csrf_token()|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<script>
|
||||
const config = JSON.parse(document.getElementById('scraper-config').textContent);
|
||||
initScraperDashboard(config);
|
||||
</script>
|
||||
```
|
||||
|
||||
## Future Improvements
|
||||
|
||||
### Potential Enhancements
|
||||
1. **Bundle Management**: Consider using webpack or similar for production builds
|
||||
2. **Unit Testing**: Add comprehensive test suite for individual modules
|
||||
3. **JSDoc Comments**: Add detailed documentation for better IDE support
|
||||
4. **Centralized Error Reporting**: Implement global error handling system
|
||||
5. **Performance Optimization**: Implement lazy loading for non-critical modules
|
||||
6. **TypeScript Migration**: Consider migrating to TypeScript for better type safety
|
||||
|
||||
### Adding New Modules
|
||||
When creating new JavaScript modules (a minimal skeleton is sketched after this list):
|
||||
1. Follow the established class-based pattern
|
||||
2. Include proper error handling
|
||||
3. Use the configuration pattern for Jinja variables
|
||||
4. Add documentation to this README
|
||||
5. Update templates to use the new module
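
A skeleton following these conventions might look like the sketch below; the class name, config keys, and endpoint are placeholders (this is not the actual `table-handler.js` implementation):

```javascript
// example-widget.js -- placeholder skeleton for a new module.
class ExampleWidget {
  constructor(config) {
    // Jinja values arrive via the JSON <script type="application/json"> pattern,
    // never embedded inline in JavaScript.
    this.containerId = config.containerId;
    this.dataUrl = config.dataUrl;
  }

  init() {
    this.container = document.getElementById(this.containerId);
    if (!this.container) {
      console.warn(`ExampleWidget: container "${this.containerId}" not found`);
      return;
    }
    this.load();
  }

  async load() {
    try {
      // apiRequest and createStatusBadge are shared utilities from common.js.
      const items = await apiRequest(this.dataUrl, { method: "GET" });
      this.container.innerHTML = items
        .map((item) => `<div>${item.title} ${createStatusBadge(item.status)}</div>`)
        .join("");
    } catch (error) {
      showFlashMessage("Failed to load widget data", "error");
    }
  }
}
```

The template would then pass its Jinja values to the module through a JSON config block, exactly as in the examples above.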
|
||||
|
||||
## Testing
|
||||
|
||||
A test file `test_js_modularization.py` has been created to verify the modularization. To run comprehensive testing:
|
||||
|
||||
```bash
|
||||
python test_js_modularization.py
|
||||
```
|
||||
|
||||
This will verify:
|
||||
- All JavaScript files exist and are properly formatted
|
||||
- Templates correctly reference the modular files
|
||||
- Configuration patterns are properly implemented
|
||||
- No inline JavaScript remains in templates
|
||||
|
||||
## Maintenance
|
||||
|
||||
### When Making Changes
|
||||
1. **Update Single Module**: Changes to functionality only require editing one file
|
||||
2. **Test Affected Templates**: Ensure all templates using the module still work
|
||||
3. **Update Documentation**: Keep this README current with any changes
|
||||
4. **Consider Dependencies**: Check if changes affect other modules
|
||||
|
||||
### File Organization
|
||||
```
|
||||
/static/js/
|
||||
├── README.md # This documentation
|
||||
├── common.js # Shared utilities
|
||||
├── modal-handler.js # Modal functionality
|
||||
├── form-handler.js # Form processing
|
||||
├── chart.js # Chart visualization
|
||||
├── scraper-control.js # Scraper operations
|
||||
├── paper-processor.js # Paper management
|
||||
├── activity-monitor.js # Activity tracking
|
||||
├── scraper-dashboard.js # Dashboard coordination
|
||||
├── config-handler.js # Configuration management
|
||||
└── table-handler.js # Table utilities
|
||||
```
|
||||
|
||||
## Migration Summary
|
||||
|
||||
The modularization successfully transformed **~800+ lines of inline JavaScript** from templates into a maintainable, reusable module system. This improvement provides:
|
||||
|
||||
- **Enhanced maintainability** through single-responsibility modules
|
||||
- **Reduced code duplication** via shared utility functions
|
||||
- **Improved linter compatibility** by separating template and JavaScript concerns
|
||||
- **Better testability** with isolated, unit-testable modules
|
||||
- **Cleaner templates** with minimal, configuration-only JavaScript
|
||||
- **Easier debugging** with clearly separated concerns and proper error handling
|
||||
|
||||
All existing functionality has been preserved while significantly improving the codebase architecture and developer experience.
|
@ -38,12 +38,12 @@ class ScraperController {
|
||||
this.resetButton.addEventListener("click", () => this.resetScraper());
|
||||
}
|
||||
|
||||
// Volume form
|
||||
const volumeForm = document.getElementById("volumeForm");
|
||||
if (volumeForm) {
|
||||
volumeForm.addEventListener("submit", (e) => {
|
||||
// Configuration form (handles both volume and scraper module)
|
||||
const configForm = document.getElementById("volumeForm");
|
||||
if (configForm) {
|
||||
configForm.addEventListener("submit", (e) => {
|
||||
e.preventDefault();
|
||||
this.updateVolume();
|
||||
this.updateConfiguration();
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -245,25 +245,46 @@ class ScraperController {
|
||||
}
|
||||
|
||||
/**
|
||||
* Update volume configuration
|
||||
* Update configuration (volume and/or scraper module)
|
||||
*/
|
||||
async updateVolume() {
|
||||
async updateConfiguration() {
|
||||
const volumeInput = document.getElementById("volumeInput");
|
||||
const scraperSelect = document.getElementById("mainScraperSelect");
|
||||
const submitButton = document.querySelector(
|
||||
'#volumeForm button[type="submit"]'
|
||||
);
|
||||
|
||||
if (!volumeInput || !submitButton) return;
|
||||
if (!submitButton) return;
|
||||
|
||||
const volume = volumeInput.value;
|
||||
const updates = {};
|
||||
let hasChanges = false;
|
||||
|
||||
// Basic validation
|
||||
if (!volume || volume < 1 || volume > this.maxVolume) {
|
||||
showFlashMessage(
|
||||
`Please enter a valid volume between 1 and ${this.maxVolume}`,
|
||||
"warning"
|
||||
);
|
||||
volumeInput.focus();
|
||||
// Check volume changes
|
||||
if (volumeInput) {
|
||||
const volume = volumeInput.value;
|
||||
|
||||
// Basic validation
|
||||
if (!volume || volume < 1 || volume > this.maxVolume) {
|
||||
showFlashMessage(
|
||||
`Please enter a valid volume between 1 and ${this.maxVolume}`,
|
||||
"warning"
|
||||
);
|
||||
volumeInput.focus();
|
||||
return;
|
||||
}
|
||||
|
||||
updates.volume = volume;
|
||||
hasChanges = true;
|
||||
}
|
||||
|
||||
// Check scraper module changes
|
||||
if (scraperSelect && scraperSelect.value) {
|
||||
updates.scraper_module = scraperSelect.value;
|
||||
hasChanges = true;
|
||||
}
|
||||
|
||||
if (!hasChanges) {
|
||||
showFlashMessage("No changes to save", "info");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -273,21 +294,24 @@ class ScraperController {
|
||||
try {
|
||||
const data = await apiRequest("/scraper/update_config", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({ volume: volume }),
|
||||
body: JSON.stringify(updates),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage(
|
||||
data.message || "Volume updated successfully",
|
||||
data.message || "Configuration updated successfully",
|
||||
"success"
|
||||
);
|
||||
} else {
|
||||
showFlashMessage(data.message || "Failed to update volume", "error");
|
||||
showFlashMessage(
|
||||
data.message || "Failed to update configuration",
|
||||
"error"
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error updating volume:", error);
|
||||
console.error("Error updating configuration:", error);
|
||||
showFlashMessage(
|
||||
"Network error while updating volume. Please try again.",
|
||||
"Network error while updating configuration. Please try again.",
|
||||
"error"
|
||||
);
|
||||
} finally {
|
||||
|
500
scipaperloader/static/js/scraper-overview.js
Normal file
@ -0,0 +1,500 @@
|
||||
/**
|
||||
* Scraper Overview functionality
|
||||
*/
|
||||
|
||||
class ScraperOverview {
|
||||
constructor() {
|
||||
this.modal = null;
|
||||
this.scrapers = [];
|
||||
this.systemConfig = {};
|
||||
this.init();
|
||||
}
|
||||
|
||||
init() {
|
||||
// Initialize modal reference
|
||||
this.modal = document.getElementById("scraperOverviewModal");
|
||||
|
||||
// Load data when modal is shown
|
||||
if (this.modal) {
|
||||
this.modal.addEventListener("show.bs.modal", () => {
|
||||
this.loadScraperOverview();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async loadScraperOverview() {
|
||||
const loadingEl = document.getElementById("scraperOverviewLoading");
|
||||
const errorEl = document.getElementById("scraperOverviewError");
|
||||
const contentEl = document.getElementById("scraperOverviewContent");
|
||||
|
||||
// Show loading state
|
||||
loadingEl?.classList.remove("d-none");
|
||||
errorEl?.classList.add("d-none");
|
||||
contentEl?.classList.add("d-none");
|
||||
|
||||
try {
|
||||
// Load scrapers, system config, and publishers in parallel
|
||||
const [scrapersResponse, statusResponse, publishersResponse] =
|
||||
await Promise.all([
|
||||
fetch("/scraper/scrapers"),
|
||||
fetch("/scraper/status"),
|
||||
fetch("/scraper/publishers"),
|
||||
]);
|
||||
|
||||
if (
|
||||
!scrapersResponse.ok ||
|
||||
!statusResponse.ok ||
|
||||
!publishersResponse.ok
|
||||
) {
|
||||
throw new Error("Failed to load scraper information");
|
||||
}
|
||||
|
||||
const scrapersData = await scrapersResponse.json();
|
||||
const statusData = await statusResponse.json();
|
||||
const publishersData = await publishersResponse.json();
|
||||
|
||||
if (
|
||||
!scrapersData.success ||
|
||||
!statusData.success ||
|
||||
!publishersData.success
|
||||
) {
|
||||
throw new Error(
|
||||
scrapersData.message ||
|
||||
statusData.message ||
|
||||
publishersData.message ||
|
||||
"Unknown error"
|
||||
);
|
||||
}
|
||||
|
||||
this.scrapers = scrapersData.scrapers;
|
||||
this.systemConfig = statusData;
|
||||
this.publishersData = publishersData.data;
|
||||
|
||||
// Update UI
|
||||
this.updateSystemConfig();
|
||||
this.updateScrapersTable();
|
||||
this.updatePublishersSection();
|
||||
this.updateStatusFlowDiagram();
|
||||
|
||||
// Show content
|
||||
loadingEl?.classList.add("d-none");
|
||||
contentEl?.classList.remove("d-none");
|
||||
} catch (error) {
|
||||
console.error("Error loading scraper overview:", error);
|
||||
|
||||
// Show error state
|
||||
loadingEl?.classList.add("d-none");
|
||||
const errorMessage = document.getElementById(
|
||||
"scraperOverviewErrorMessage"
|
||||
);
|
||||
if (errorMessage) {
|
||||
errorMessage.textContent =
|
||||
error.message || "Failed to load scraper information";
|
||||
}
|
||||
errorEl?.classList.remove("d-none");
|
||||
}
|
||||
}
|
||||
|
||||
updateSystemConfig() {
|
||||
// Current scraper module
|
||||
const currentModuleEl = document.getElementById("currentScraperModule");
|
||||
if (currentModuleEl) {
|
||||
const currentModule =
|
||||
this.systemConfig.current_scraper_module || "System Default";
|
||||
currentModuleEl.textContent = currentModule;
|
||||
currentModuleEl.className = "badge bg-primary";
|
||||
}
|
||||
|
||||
// Volume limit
|
||||
const volumeLimitEl = document.getElementById("currentVolumeLimit");
|
||||
if (volumeLimitEl) {
|
||||
const volumeLimit = this.systemConfig.volume_config || "Unknown";
|
||||
volumeLimitEl.textContent = volumeLimit;
|
||||
}
|
||||
|
||||
// Total modules
|
||||
const totalModulesEl = document.getElementById("totalScraperModules");
|
||||
if (totalModulesEl) {
|
||||
totalModulesEl.textContent = this.scrapers.length;
|
||||
}
|
||||
|
||||
// Paper counts summary
|
||||
const paperCountsEl = document.getElementById("paperCountsSummary");
|
||||
if (paperCountsEl && this.systemConfig.paper_counts) {
|
||||
const counts = this.systemConfig.paper_counts;
|
||||
paperCountsEl.innerHTML = `
|
||||
<div class="d-flex flex-wrap gap-2">
|
||||
<span class="badge bg-primary">${counts.new || 0} New</span>
|
||||
<span class="badge bg-warning">${
|
||||
counts.processing || 0
|
||||
} Processing</span>
|
||||
<span class="badge bg-success">${
|
||||
counts.done || 0
|
||||
} Done</span>
|
||||
<span class="badge bg-danger">${
|
||||
counts.failed || 0
|
||||
} Failed</span>
|
||||
<span class="badge bg-info">${
|
||||
counts.pending || 0
|
||||
} Pending</span>
|
||||
<span class="badge bg-secondary">${
|
||||
counts.retrying || 0
|
||||
} Retrying</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
}
|
||||
|
||||
updateScrapersTable() {
|
||||
const tbody = document.getElementById("scrapersTableBody");
|
||||
if (!tbody) return;
|
||||
|
||||
tbody.innerHTML = "";
|
||||
|
||||
this.scrapers.forEach((scraper) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Check if this is the current active scraper
|
||||
const isCurrentScraper =
|
||||
scraper.name === this.systemConfig.current_scraper_module;
|
||||
|
||||
if (scraper.error) {
|
||||
row.innerHTML = `
|
||||
<td>${scraper.name}</td>
|
||||
<td colspan="5" class="text-danger">
|
||||
<i class="fas fa-exclamation-triangle"></i> ${scraper.error}
|
||||
</td>
|
||||
`;
|
||||
} else {
|
||||
row.innerHTML = `
|
||||
<td>
|
||||
<strong>${scraper.name}</strong>
|
||||
${
|
||||
scraper.name === "dummy"
|
||||
? '<span class="badge bg-info ms-2">Test Module</span>'
|
||||
: ""
|
||||
}
|
||||
${
|
||||
isCurrentScraper
|
||||
? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
|
||||
: ""
|
||||
}
|
||||
</td>
|
||||
<td class="scraper-description">
|
||||
${this.truncateDescription(scraper.description)}
|
||||
</td>
|
||||
<td class="input-status-list">
|
||||
${this.renderStatusBadges(
|
||||
scraper.input_statuses,
|
||||
"bg-info"
|
||||
)}
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-success">${
|
||||
scraper.output_status_success
|
||||
}</span>
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-danger">${
|
||||
scraper.output_status_failure
|
||||
}</span>
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-warning">${
|
||||
scraper.output_status_processing
|
||||
}</span>
|
||||
</td>
|
||||
`;
|
||||
}
|
||||
|
||||
// Highlight the current scraper row
|
||||
if (isCurrentScraper) {
|
||||
row.classList.add("table-success");
|
||||
}
|
||||
|
||||
tbody.appendChild(row);
|
||||
});
|
||||
}
|
||||
|
||||
updateStatusFlowDiagram() {
|
||||
const diagramEl = document.getElementById("statusFlowDiagram");
|
||||
if (!diagramEl) return;
|
||||
|
||||
// Analyze actual scrapers to build real flow
|
||||
const statusFlow = this.analyzeScraperFlow();
|
||||
|
||||
let diagramHTML = '<div class="status-flow-container">';
|
||||
|
||||
// Create visual flow based on actual scrapers
|
||||
statusFlow.forEach((stage, index) => {
|
||||
if (index > 0) {
|
||||
diagramHTML +=
|
||||
'<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>';
|
||||
}
|
||||
|
||||
diagramHTML += '<div class="status-flow-stage mb-4 p-3 border rounded">';
|
||||
diagramHTML += `<div class="fw-bold mb-2 text-primary">${stage.title}</div>`;
|
||||
|
||||
if (stage.scrapers && stage.scrapers.length > 0) {
|
||||
diagramHTML +=
|
||||
'<div class="mb-2"><small class="text-muted">Handled by: ' +
|
||||
stage.scrapers.map((s) => `<strong>${s}</strong>`).join(", ") +
|
||||
"</small></div>";
|
||||
}
|
||||
|
||||
diagramHTML += '<div class="status-badges">';
|
||||
stage.statuses.forEach((status, statusIndex) => {
|
||||
if (statusIndex > 0) {
|
||||
diagramHTML += '<i class="fas fa-arrow-right status-flow-arrow"></i>';
|
||||
}
|
||||
|
||||
const badgeClass = this.getStatusBadgeClass(status);
|
||||
diagramHTML += `<span class="status-flow-node badge ${badgeClass}">${status}</span>`;
|
||||
});
|
||||
diagramHTML += "</div>";
|
||||
|
||||
if (stage.description) {
|
||||
diagramHTML += `<div class="small text-muted mt-2">${stage.description}</div>`;
|
||||
}
|
||||
|
||||
diagramHTML += "</div>";
|
||||
});
|
||||
|
||||
diagramHTML += "</div>";
|
||||
|
||||
// Add explanation
|
||||
diagramHTML += `
|
||||
<div class="mt-4 p-3 bg-light rounded">
|
||||
<h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
|
||||
<ul class="small mb-0">
|
||||
<li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
|
||||
<li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
|
||||
<li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
|
||||
<li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
|
||||
<li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
|
||||
</ul>
|
||||
</div>
|
||||
`;
|
||||
|
||||
diagramEl.innerHTML = diagramHTML;
|
||||
}
|
||||
|
||||
analyzeScraperFlow() {
|
||||
// Build actual flow based on available scrapers
|
||||
const stages = [];
|
||||
const allInputStatuses = new Set();
|
||||
const allOutputStatuses = new Set();
|
||||
const scrapersByInput = {};
|
||||
|
||||
// Analyze scrapers to understand the flow
|
||||
this.scrapers.forEach((scraper) => {
|
||||
if (scraper.input_statuses) {
|
||||
scraper.input_statuses.forEach((status) => {
|
||||
allInputStatuses.add(status);
|
||||
if (!scrapersByInput[status]) {
|
||||
scrapersByInput[status] = [];
|
||||
}
|
||||
scrapersByInput[status].push(scraper.name);
|
||||
});
|
||||
}
|
||||
|
||||
if (scraper.output_status_success)
|
||||
allOutputStatuses.add(scraper.output_status_success);
|
||||
if (scraper.output_status_failure)
|
||||
allOutputStatuses.add(scraper.output_status_failure);
|
||||
});
|
||||
|
||||
// Entry point
|
||||
if (allInputStatuses.has("New")) {
|
||||
stages.push({
|
||||
title: "Entry Point",
|
||||
statuses: ["New"],
|
||||
scrapers: scrapersByInput["New"] || [],
|
||||
description: "Newly uploaded papers enter the processing pipeline",
|
||||
});
|
||||
}
|
||||
|
||||
// Processing stages
|
||||
const processingStatuses = Array.from(allInputStatuses).filter(
|
||||
(status) => !["New", "Done", "Failed"].includes(status)
|
||||
);
|
||||
|
||||
if (processingStatuses.length > 0) {
|
||||
stages.push({
|
||||
title: "Processing Stages",
|
||||
statuses: processingStatuses,
|
||||
scrapers: [],
|
||||
description: "Papers move through various processing stages",
|
||||
});
|
||||
}
|
||||
|
||||
// Final outputs
|
||||
const finalStatuses = ["Done", "Failed"];
|
||||
stages.push({
|
||||
title: "Final States",
|
||||
statuses: finalStatuses.filter((status) => allOutputStatuses.has(status)),
|
||||
scrapers: [],
|
||||
description: "Papers end up in final success or failure states",
|
||||
});
|
||||
|
||||
// Retry handling
|
||||
if (allInputStatuses.has("Failed")) {
|
||||
stages.push({
|
||||
title: "Retry Processing",
|
||||
statuses: ["Failed", "Retrying"],
|
||||
scrapers: scrapersByInput["Failed"] || [],
|
||||
description: "Failed papers can be retried with specialized scrapers",
|
||||
});
|
||||
}
|
||||
|
||||
return stages;
|
||||
}
|
||||
|
||||
getStatusBadgeClass(status) {
|
||||
const statusClasses = {
|
||||
New: "bg-primary",
|
||||
Pending: "bg-warning",
|
||||
Processing: "bg-warning",
|
||||
Retrying: "bg-warning",
|
||||
Done: "bg-success",
|
||||
Failed: "bg-danger",
|
||||
HtmlDownloaded: "bg-info",
|
||||
PublisherDetected: "bg-info",
|
||||
TextExtracted: "bg-info",
|
||||
};
|
||||
|
||||
return statusClasses[status] || "bg-secondary";
|
||||
}
|
||||
|
||||
renderStatusBadges(statuses, defaultClass = "bg-secondary") {
|
||||
if (!Array.isArray(statuses)) return "";
|
||||
|
||||
return statuses
|
||||
.map(
|
||||
(status) =>
|
||||
`<span class="badge ${this.getStatusBadgeClass(
|
||||
status
|
||||
)} status-badge">${status}</span>`
|
||||
)
|
||||
.join("");
|
||||
}
|
||||
|
||||
truncateDescription(description, maxLength = 100) {
|
||||
if (!description) return "No description available";
|
||||
|
||||
if (description.length <= maxLength) return description;
|
||||
|
||||
return description.substring(0, maxLength).trim() + "...";
|
||||
}
|
||||
|
||||
updatePublishersSection() {
|
||||
// Update publisher statistics
|
||||
const publisherStatsEl = document.getElementById("publisherStats");
|
||||
if (publisherStatsEl && this.publishersData && this.publishersData.stats) {
|
||||
const stats = this.publishersData.stats;
|
||||
publisherStatsEl.innerHTML = `
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-primary mb-1">${stats.total_publishers}</div>
|
||||
<div class="text-muted small">Total Publishers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-success mb-1">${stats.publishers_with_parsers}</div>
|
||||
<div class="text-muted small">With Parsers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-warning mb-1">${stats.publishers_without_parsers}</div>
|
||||
<div class="text-muted small">Missing Parsers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-info mb-1">${stats.total_papers_with_publisher}</div>
|
||||
<div class="text-muted small">Papers with Publisher</div>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
// Update publishers table
|
||||
const publishersTableBody = document.getElementById("publishersTableBody");
|
||||
if (
|
||||
publishersTableBody &&
|
||||
this.publishersData &&
|
||||
this.publishersData.publishers
|
||||
) {
|
||||
publishersTableBody.innerHTML = "";
|
||||
|
||||
if (this.publishersData.publishers.length === 0) {
|
||||
publishersTableBody.innerHTML = `
|
||||
<tr>
|
||||
<td colspan="4" class="text-center text-muted py-4">
|
||||
<i class="fas fa-info-circle"></i> No publishers detected yet.<br>
|
||||
<small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
|
||||
</td>
|
||||
</tr>
|
||||
`;
|
||||
return;
|
||||
}
|
||||
|
||||
this.publishersData.publishers.forEach((publisher) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Publisher status badge
|
||||
const statusBadge = publisher.has_parser
|
||||
? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
|
||||
: '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';
|
||||
|
||||
// Parser availability indicator
|
||||
const parserIndicator = publisher.has_parser
|
||||
? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
|
||||
: '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';
|
||||
|
||||
row.innerHTML = `
|
||||
<td>
|
||||
<strong>${publisher.name}</strong>
|
||||
</td>
|
||||
<td>
|
||||
<span class="badge bg-info">${publisher.paper_count}</span>
|
||||
</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td class="text-center">${parserIndicator}</td>
|
||||
`;
|
||||
|
||||
publishersTableBody.appendChild(row);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Public method to show the modal
|
||||
show() {
|
||||
if (this.modal) {
|
||||
const bootstrapModal = new bootstrap.Modal(this.modal);
|
||||
bootstrapModal.show();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Global function to load scraper overview (used by retry button)
|
||||
function loadScraperOverview() {
|
||||
if (window.scraperOverview) {
|
||||
window.scraperOverview.loadScraperOverview();
|
||||
}
|
||||
}
|
||||
|
||||
// Global function to show scraper overview modal
|
||||
function showScraperOverview() {
|
||||
if (!window.scraperOverview) {
|
||||
window.scraperOverview = new ScraperOverview();
|
||||
}
|
||||
window.scraperOverview.show();
|
||||
}
|
||||
|
||||
// Initialize when DOM is ready
|
||||
document.addEventListener("DOMContentLoaded", function () {
|
||||
window.scraperOverview = new ScraperOverview();
|
||||
});
|
@ -65,7 +65,13 @@
|
||||
<div class="col-md-6">
|
||||
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
|
||||
<div class="form-section">
|
||||
<h6>Scraper Module</h6>
|
||||
<div class="d-flex justify-content-between align-items-center mb-2">
|
||||
<h6>Scraper Module</h6>
|
||||
<button type="button" class="btn btn-outline-info btn-sm"
|
||||
onclick="showScraperOverview()" title="View scraper modules overview">
|
||||
<i class="fas fa-info-circle"></i> How Scrapers Work
|
||||
</button>
|
||||
</div>
|
||||
<p class="text-muted">Select which scraper module to use for processing papers.</p>
|
||||
|
||||
<div class="mb-3">
|
||||
|
@ -53,4 +53,13 @@
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock content %}
|
||||
|
||||
<!-- Include the scraper overview modal -->
|
||||
{% include "partials/scraper_overview_modal.html.jinja" %}
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
|
||||
{% endblock scripts %}
|
@ -0,0 +1,249 @@
|
||||
<!-- Scraper Overview Modal -->
|
||||
<div class="modal fade" id="scraperOverviewModal" tabindex="-1" role="dialog"
|
||||
aria-labelledby="scraperOverviewModalLabel" aria-hidden="true">
|
||||
<div class="modal-dialog modal-xl" role="document">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title" id="scraperOverviewModalLabel">
|
||||
<i class="fas fa-cogs"></i> Scraper Modules Overview
|
||||
</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<!-- Loading state -->
|
||||
<div id="scraperOverviewLoading" class="text-center py-4">
|
||||
<div class="spinner-border text-primary" role="status">
|
||||
<span class="visually-hidden">Loading...</span>
|
||||
</div>
|
||||
<p class="mt-2 text-muted">Loading scraper information...</p>
|
||||
</div>
|
||||
|
||||
<!-- Error state -->
|
||||
<div id="scraperOverviewError" class="alert alert-danger d-none" role="alert">
|
||||
<h6 class="alert-heading">Error Loading Scrapers</h6>
|
||||
<p id="scraperOverviewErrorMessage"></p>
|
||||
<button class="btn btn-outline-danger btn-sm" onclick="loadScraperOverview()">
|
||||
<i class="fas fa-redo"></i> Retry
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Content -->
|
||||
<div id="scraperOverviewContent" class="d-none">
|
||||
<!-- Scraper Architecture Overview -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-info-circle"></i> How Scraper Modules Work
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p class="mb-3">
|
||||
SciPaperLoader uses a modular scraper architecture where each scraper module handles
|
||||
specific paper processing stages. Papers flow through different statuses as they are
|
||||
processed by various scrapers.
|
||||
</p>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<h6>Key Concepts:</h6>
|
||||
<ul class="small">
|
||||
<li><strong>Input Statuses:</strong> Paper statuses this scraper can process
|
||||
</li>
|
||||
<li><strong>Output Statuses:</strong> Statuses papers get after processing</li>
|
||||
<li><strong>Processing Status:</strong> Temporary status while scraper works
|
||||
</li>
|
||||
<li><strong>Pipeline:</strong> Scrapers can be chained together</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<h6>Status Flow Example:</h6>
|
||||
<div class="d-flex align-items-center small">
|
||||
<span class="badge bg-info">New</span>
|
||||
<i class="fas fa-arrow-right mx-2"></i>
|
||||
<span class="badge bg-warning">Processing</span>
|
||||
<i class="fas fa-arrow-right mx-2"></i>
|
||||
<span class="badge bg-success">Done</span>
|
||||
</div>
|
||||
<div class="text-muted mt-1">Papers transition through these statuses</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Current System Configuration -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-server"></i> System Configuration
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row">
|
||||
<div class="col-md-4">
|
||||
<p><strong>Active Scraper Module:</strong> <span id="currentScraperModule"
|
||||
class="badge bg-primary">Loading...</span></p>
|
||||
<p><strong>Daily Volume Limit:</strong> <span
|
||||
id="currentVolumeLimit">Loading...</span> papers</p>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p><strong>Total Available Modules:</strong> <span
|
||||
id="totalScraperModules">Loading...</span></p>
|
||||
<p><strong>Processing Pipeline:</strong> <span
|
||||
id="processingPipeline">Multi-stage</span></p>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p><strong>Current Paper Counts:</strong></p>
|
||||
<div id="paperCountsSummary" class="small">
|
||||
<!-- Will be populated by JavaScript -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Available Scrapers Table -->
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-list"></i> Available Scraper Modules
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Module Name</th>
|
||||
<th>Description</th>
|
||||
<th>Input Statuses</th>
|
||||
<th>Success Output</th>
|
||||
<th>Failure Output</th>
|
||||
<th>Processing Status</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="scrapersTableBody">
|
||||
<!-- Table content will be populated by JavaScript -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Publisher Parser Overview -->
|
||||
<div class="card mt-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-building"></i> Publisher Parser Overview
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row mb-3">
|
||||
<div class="col-md-12">
|
||||
<p class="text-muted mb-2">
|
||||
<i class="fas fa-info-circle"></i>
|
||||
Publishers are detected from paper URLs and mapped to specific parser modules
|
||||
for content extraction.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Publisher Statistics -->
|
||||
<div class="row mb-4" id="publisherStats">
|
||||
<!-- Will be populated by JavaScript -->
|
||||
</div>
|
||||
|
||||
<!-- Publishers Table -->
|
||||
<div class="table-responsive">
|
||||
<table class="table table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Publisher</th>
|
||||
<th>Papers</th>
|
||||
<th>Parser Status</th>
|
||||
<th>Parser Available</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="publishersTableBody">
|
||||
<!-- Table content will be populated by JavaScript -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Status Flow Diagram -->
|
||||
<div class="card mt-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-project-diagram"></i> Paper Status Flow Diagram
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div id="statusFlowDiagram" class="text-center py-4">
|
||||
<!-- This will be populated by JavaScript -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<div class="d-flex justify-content-between w-100">
|
||||
<small class="text-muted">
|
||||
<i class="fas fa-lightbulb"></i>
|
||||
Tip: Scrapers can be chained to create complex processing pipelines
|
||||
</small>
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
/* Custom styles for the scraper overview modal */
|
||||
#scraperOverviewModal .modal-xl {
|
||||
max-width: 1200px;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .table th {
|
||||
font-size: 0.9rem;
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .badge {
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .status-badge {
|
||||
margin: 2px;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
.status-flow-node {
|
||||
display: inline-block;
|
||||
padding: 8px 16px;
|
||||
margin: 4px;
|
||||
border-radius: 20px;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.status-flow-arrow {
|
||||
color: #6c757d;
|
||||
margin: 0 8px;
|
||||
}
|
||||
|
||||
.scraper-description {
|
||||
max-width: 300px;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.input-status-list {
|
||||
max-width: 150px;
|
||||
}
|
||||
|
||||
.status-output {
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
</style>
|
@ -114,20 +114,44 @@
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Volume Configuration</h5>
|
||||
<div class="card-header d-flex justify-content-between align-items-center">
|
||||
<h5>Scraper Configuration</h5>
|
||||
<button type="button" class="btn btn-outline-info btn-sm" onclick="showScraperOverview()"
|
||||
title="View scraper modules overview">
|
||||
<i class="fas fa-info-circle"></i> How Scrapers Work
|
||||
</button>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<form id="volumeForm">
|
||||
<div class="form-group">
|
||||
<div class="form-group mb-3">
|
||||
<label for="volumeInput">Papers per day:</label>
|
||||
<input type="number" class="form-control" id="volumeInput"
|
||||
value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
|
||||
<button type="submit" class="btn btn-primary mt-2">
|
||||
<i class="fas fa-save"></i> Update Volume
|
||||
</button>
|
||||
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
|
||||
</div>
|
||||
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
|
||||
|
||||
<div class="form-group mb-3">
|
||||
<label for="mainScraperSelect">Scraper Module:</label>
|
||||
<select class="form-control" id="mainScraperSelect">
|
||||
{% for module in available_scraper_modules %}
|
||||
<option value="{{ module }}" {% if module==current_scraper_module %}selected{% endif %}>
|
||||
{{ module }}
|
||||
{% if scraper_details[module] %}
|
||||
- {{ scraper_details[module].description[:50] }}{% if
|
||||
scraper_details[module].description|length > 50 %}...{% endif %}
|
||||
{% endif %}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<div class="form-text">
|
||||
Select which scraper module to use for automated processing. Current: <strong>{{
|
||||
current_scraper_module }}</strong>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<button type="submit" class="btn btn-primary">
|
||||
<i class="fas fa-save"></i> Update Configuration
|
||||
</button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
@ -306,6 +330,10 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Include the scraper overview modal -->
|
||||
{% include "partials/scraper_overview_modal.html.jinja" %}
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
@ -320,6 +348,7 @@
|
||||
<script src="{{ url_for('static', filename='js/paper-processor.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/activity-monitor.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-dashboard.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
|
||||
|
||||
<script id="scraper-config" type="application/json">
|
||||
{
|
||||
|