/**
 * Scraper Overview functionality.
 *
 * Controller for the element with id "scraperOverviewModal". Every time the
 * modal fires "show.bs.modal" it fetches the scraper list, the system status
 * and the publisher data, then writes the results into the modal's elements.
 *
 * NOTE(review): several templates below consist only of text and "|"
 * separators with no element markup. That text is reproduced verbatim here;
 * confirm against the page whether tags are missing from these templates.
 */
class ScraperOverview {
  /**
   * Creates the controller with empty state and immediately calls init(),
   * so `document` must be available when this runs.
   */
  constructor() {
    this.modal = null;
    this.scrapers = [];
    this.systemConfig = {};
    // Declared here so every field read by the update methods exists before
    // the first load completes.
    this.publishersData = null;
    this.init();
  }

  /**
   * Looks up the modal element and, when it exists, reloads the overview
   * each time the modal is about to be shown.
   */
  init() {
    this.modal = document.getElementById("scraperOverviewModal");
    if (!this.modal) {
      return;
    }
    this.modal.addEventListener("show.bs.modal", () => {
      // loadScraperOverview() handles its own failures in a try/catch.
      this.loadScraperOverview();
    });
  }

  /**
   * Fetches "/scraper/scrapers", "/scraper/status" and "/scraper/publishers"
   * in parallel, stores the payloads on the instance and refreshes every
   * section of the modal.
   *
   * While loading, the loading element is shown and the error/content
   * elements are hidden (via the "d-none" class). Any failure (network error,
   * non-OK response, `success` not truthy in a payload) is logged and its
   * message is written to "scraperOverviewErrorMessage".
   *
   * @returns {Promise<void>} Resolves once the UI has been updated; failures
   *   are reported in the UI rather than rejected.
   */
  async loadScraperOverview() {
    const loadingEl = document.getElementById("scraperOverviewLoading");
    const errorEl = document.getElementById("scraperOverviewError");
    const contentEl = document.getElementById("scraperOverviewContent");

    loadingEl?.classList.remove("d-none");
    errorEl?.classList.add("d-none");
    contentEl?.classList.add("d-none");

    try {
      const responses = await Promise.all([
        fetch("/scraper/scrapers"),
        fetch("/scraper/status"),
        fetch("/scraper/publishers"),
      ]);
      if (responses.some((response) => !response.ok)) {
        throw new Error("Failed to load scraper information");
      }

      const payloads = await Promise.all(
        responses.map((response) => response.json())
      );
      if (payloads.some((payload) => !payload?.success)) {
        // First non-empty message wins, in request order.
        const message = payloads
          .map((payload) => payload?.message)
          .find(Boolean);
        throw new Error(message || "Unknown error");
      }

      const [scrapersData, statusData, publishersData] = payloads;
      // Guard against a missing/non-array list so the update methods can
      // always iterate and read `.length`.
      this.scrapers = Array.isArray(scrapersData.scrapers)
        ? scrapersData.scrapers
        : [];
      this.systemConfig = statusData;
      this.publishersData = publishersData.data;

      this.updateSystemConfig();
      this.updateScrapersTable();
      this.updatePublishersSection();
      this.updateStatusFlowDiagram();

      loadingEl?.classList.add("d-none");
      contentEl?.classList.remove("d-none");
    } catch (error) {
      console.error("Error loading scraper overview:", error);
      loadingEl?.classList.add("d-none");
      const errorMessage = document.getElementById(
        "scraperOverviewErrorMessage"
      );
      if (errorMessage) {
        errorMessage.textContent =
          error.message || "Failed to load scraper information";
      }
      errorEl?.classList.remove("d-none");
    }
  }

  /**
   * Escapes the characters that are significant in HTML text and attribute
   * values so a value can be interpolated into an `innerHTML` string.
   *
   * @param {*} value - Any value; it is converted with String() first.
   * @returns {string} The escaped text.
   */
  escapeHtml(value) {
    const replacements = {
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': "&quot;",
      "'": "&#39;",
    };
    return String(value).replace(/[&<>"']/g, (char) => replacements[char]);
  }

  /**
   * Writes the current module, volume limit, module count and paper counts
   * from `this.systemConfig` / `this.scrapers` into their elements. Each
   * element is optional; missing ones are skipped.
   */
  updateSystemConfig() {
    const currentModuleEl = document.getElementById("currentScraperModule");
    if (currentModuleEl) {
      currentModuleEl.textContent =
        this.systemConfig.current_scraper_module || "System Default";
      currentModuleEl.className = "badge bg-primary";
    }

    const volumeLimitEl = document.getElementById("currentVolumeLimit");
    if (volumeLimitEl) {
      // `??` rather than `||` so a configured value of 0 is displayed
      // instead of being replaced by "Unknown".
      volumeLimitEl.textContent = this.systemConfig.volume_config ?? "Unknown";
    }

    const totalModulesEl = document.getElementById("totalScraperModules");
    if (totalModulesEl) {
      totalModulesEl.textContent = this.scrapers.length;
    }

    const paperCountsEl = document.getElementById("paperCountsSummary");
    const counts = this.systemConfig.paper_counts;
    if (paperCountsEl && counts) {
      const labels = [
        ["new", "New"],
        ["processing", "Processing"],
        ["done", "Done"],
        ["failed", "Failed"],
        ["pending", "Pending"],
        ["retrying", "Retrying"],
      ];
      const lines = labels.map(
        ([key, label]) => `${this.escapeHtml(counts[key] || 0)} ${label}`
      );
      paperCountsEl.innerHTML = ["", ...lines, ""].join("\n");
    }
  }

  /**
   * Rebuilds "scrapersTableBody" with one row per entry of `this.scrapers`.
   * Entries carrying an `error` get a short name/error row; the row whose
   * name equals `systemConfig.current_scraper_module` gets the
   * "table-success" class.
   */
  updateScrapersTable() {
    const tbody = document.getElementById("scrapersTableBody");
    if (!tbody) {
      return;
    }
    tbody.innerHTML = "";

    const activeModule = this.systemConfig.current_scraper_module;
    for (const scraper of this.scrapers) {
      const row = document.createElement("tr");
      const isCurrentScraper = scraper.name === activeModule;
      const name = this.escapeHtml(scraper.name);

      // All values originate from the server response, so they are escaped
      // before being placed into innerHTML.
      const lines = scraper.error
        ? [`${name} |`, this.escapeHtml(scraper.error), "|"]
        : [
            name,
            scraper.name === "dummy" ? "Test Module" : "",
            isCurrentScraper ? " Active" : "",
            "|",
            this.escapeHtml(this.truncateDescription(scraper.description)),
            "|",
            this.renderStatusBadges(scraper.input_statuses, "bg-info"),
            "|",
            this.escapeHtml(scraper.output_status_success),
            "|",
            this.escapeHtml(scraper.output_status_failure),
            "|",
            this.escapeHtml(scraper.output_status_processing),
            "|",
          ];
      row.innerHTML = ["", ...lines, ""].join("\n");

      if (isCurrentScraper) {
        row.classList.add("table-success");
      }
      tbody.appendChild(row);
    }
  }

  /**
   * Renders the stages returned by analyzeScraperFlow() followed by a fixed
   * explanation text into "statusFlowDiagram".
   *
   * NOTE(review): statuses are concatenated with no separator or badge
   * markup and getStatusBadgeClass() is not applied here. Confirm whether
   * markup is missing from this template.
   */
  updateStatusFlowDiagram() {
    const diagramEl = document.getElementById("statusFlowDiagram");
    if (!diagramEl) {
      return;
    }

    const escape = (value) => this.escapeHtml(value);
    const parts = [];

    this.analyzeScraperFlow().forEach((stage, index) => {
      if (index > 0) {
        parts.push("\n");
      }
      parts.push("\n", `\n${escape(stage.title)}\n`);
      if (stage.scrapers && stage.scrapers.length > 0) {
        const handlers = stage.scrapers.map(escape).join(", ");
        parts.push(`\nHandled by: ${handlers}\n`);
      }
      parts.push("\n", stage.statuses.map(escape).join(""), "\n");
      if (stage.description) {
        parts.push(`\n${escape(stage.description)}\n`);
      }
      parts.push("\n");
    });
    parts.push("\n");

    parts.push(
      [
        "",
        "Flow Explanation:",
        "- Modular Processing: Each scraper handles specific input statuses",
        "- Status Transitions: Papers move through statuses as they are processed",
        "- Pipeline Architecture: Output from one scraper can become input to another",
        "- Error Handling: Failed papers can be retried by specialized scrapers",
        "- Parallel Processing: Multiple scrapers can work on different papers simultaneously",
        "",
      ].join("\n")
    );

    diagramEl.innerHTML = parts.join("");
  }

  /**
   * Derives the display stages from `this.scrapers`.
   *
   * - "Entry Point" is added when some scraper accepts "New".
   * - "Processing Stages" lists every input status other than "New", "Done"
   *   and "Failed", in first-seen order.
   * - "Final States" is always added and lists whichever of "Done"/"Failed"
   *   appear as a success or failure output.
   * - "Retry Processing" is added when some scraper accepts "Failed".
   *
   * @returns {Array<{title: string, statuses: string[], scrapers: string[],
   *   description: string}>} The stages in display order.
   */
  analyzeScraperFlow() {
    // Map/Set rather than plain objects so a status named like an
    // Object.prototype member (e.g. "constructor") cannot collide.
    const scrapersByInput = new Map();
    const allOutputStatuses = new Set();

    for (const scraper of this.scrapers) {
      // Only iterate real arrays; anything else is treated as "no inputs".
      if (Array.isArray(scraper.input_statuses)) {
        for (const status of scraper.input_statuses) {
          if (!scrapersByInput.has(status)) {
            scrapersByInput.set(status, []);
          }
          scrapersByInput.get(status).push(scraper.name);
        }
      }
      if (scraper.output_status_success) {
        allOutputStatuses.add(scraper.output_status_success);
      }
      if (scraper.output_status_failure) {
        allOutputStatuses.add(scraper.output_status_failure);
      }
    }

    const stages = [];

    if (scrapersByInput.has("New")) {
      stages.push({
        title: "Entry Point",
        statuses: ["New"],
        scrapers: scrapersByInput.get("New"),
        description: "Newly uploaded papers enter the processing pipeline",
      });
    }

    const processingStatuses = [...scrapersByInput.keys()].filter(
      (status) => !["New", "Done", "Failed"].includes(status)
    );
    if (processingStatuses.length > 0) {
      stages.push({
        title: "Processing Stages",
        statuses: processingStatuses,
        scrapers: [],
        description: "Papers move through various processing stages",
      });
    }

    stages.push({
      title: "Final States",
      statuses: ["Done", "Failed"].filter((status) =>
        allOutputStatuses.has(status)
      ),
      scrapers: [],
      description: "Papers end up in final success or failure states",
    });

    if (scrapersByInput.has("Failed")) {
      stages.push({
        title: "Retry Processing",
        statuses: ["Failed", "Retrying"],
        scrapers: scrapersByInput.get("Failed"),
        description: "Failed papers can be retried with specialized scrapers",
      });
    }

    return stages;
  }

  /**
   * Maps a status name to a "bg-*" class name.
   *
   * @param {string} status - Status name, e.g. "Done".
   * @returns {string} The mapped class, or "bg-secondary" for any status
   *   not in the table.
   */
  getStatusBadgeClass(status) {
    const statusClasses = {
      New: "bg-primary",
      Pending: "bg-warning",
      Processing: "bg-warning",
      Retrying: "bg-warning",
      Done: "bg-success",
      Failed: "bg-danger",
      HtmlDownloaded: "bg-info",
      PublisherDetected: "bg-info",
      TextExtracted: "bg-info",
    };
    // Own-property check so inherited keys such as "toString" fall back to
    // the default instead of returning a function.
    return Object.prototype.hasOwnProperty.call(statusClasses, status)
      ? statusClasses[status]
      : "bg-secondary";
  }

  /**
   * Produces the HTML fragment for a list of statuses.
   *
   * @param {string[]} statuses - Statuses to render; a non-array yields "".
   * @param {string} [defaultClass="bg-secondary"] - Kept for interface
   *   compatibility. NOTE(review): the output does not currently use it;
   *   confirm whether badge markup is missing.
   * @returns {string} The HTML-escaped statuses concatenated without a
   *   separator.
   */
  renderStatusBadges(statuses, defaultClass = "bg-secondary") {
    if (!Array.isArray(statuses)) {
      return "";
    }
    return statuses.map((status) => this.escapeHtml(status)).join("");
  }

  /**
   * Shortens a description for table display.
   *
   * @param {*} description - Text to shorten; non-strings are converted
   *   with String().
   * @param {number} [maxLength=100] - Maximum length before truncation.
   * @returns {string} "No description available" for a falsy input, the
   *   text itself when it fits, otherwise the first `maxLength` characters
   *   (trimmed) followed by "...".
   */
  truncateDescription(description, maxLength = 100) {
    if (!description) {
      return "No description available";
    }
    const text = String(description);
    if (text.length <= maxLength) {
      return text;
    }
    return `${text.substring(0, maxLength).trim()}...`;
  }

  /**
   * Writes publisher statistics into "publisherStats" and rebuilds
   * "publishersTableBody" from `this.publishersData`. Shows a hint text
   * when the publisher list is empty.
   */
  updatePublishersSection() {
    const data = this.publishersData;
    const escape = (value) => this.escapeHtml(value);

    const publisherStatsEl = document.getElementById("publisherStats");
    if (publisherStatsEl && data?.stats) {
      const { stats } = data;
      publisherStatsEl.innerHTML = [
        "",
        escape(stats.total_publishers),
        "Total Publishers",
        escape(stats.publishers_with_parsers),
        "With Parsers",
        escape(stats.publishers_without_parsers),
        "Missing Parsers",
        escape(stats.total_papers_with_publisher),
        "Papers with Publisher",
        "",
      ].join("\n");
    }

    const publishersTableBody = document.getElementById("publishersTableBody");
    if (!publishersTableBody || !Array.isArray(data?.publishers)) {
      return;
    }
    publishersTableBody.innerHTML = "";

    if (data.publishers.length === 0) {
      publishersTableBody.innerHTML = [
        "",
        "No publishers detected yet.",
        "Run the publisher_detector scraper to identify publishers from paper URLs.",
        "|",
        "",
      ].join("\n");
      return;
    }

    for (const publisher of data.publishers) {
      const row = document.createElement("tr");
      const statusBadge = publisher.has_parser ? " Available" : " Missing";
      // NOTE(review): both branches of the parser indicator were empty
      // strings, so it is a constant here. Confirm the intended content.
      const parserIndicator = "";
      row.innerHTML = [
        "",
        escape(publisher.name),
        "|",
        escape(publisher.paper_count),
        "|",
        `${statusBadge} |`,
        `${parserIndicator} |`,
        "",
      ].join("\n");
      publishersTableBody.appendChild(row);
    }
  }

  /**
   * Public method to show the modal. Does nothing when the modal element
   * was not found.
   *
   * Reuses the Modal instance already attached to the element when the
   * loaded Bootstrap exposes `Modal.getOrCreateInstance`, so repeated calls
   * do not stack up new instances; otherwise it constructs one.
   */
  show() {
    if (!this.modal) {
      return;
    }
    const { Modal } = bootstrap;
    const instance =
      typeof Modal.getOrCreateInstance === "function"
        ? Modal.getOrCreateInstance(this.modal)
        : new Modal(this.modal);
    instance.show();
  }
}
/**
 * Global entry point used by the retry button: reloads the overview through
 * the shared instance on `window.scraperOverview`, if one exists.
 */
function loadScraperOverview() {
  const overview = window.scraperOverview;
  if (!overview) {
    return;
  }
  overview.loadScraperOverview();
}
/**
 * Global entry point that opens the scraper overview modal, creating the
 * shared `window.scraperOverview` instance first when none exists yet.
 */
function showScraperOverview() {
  const needsInstance = !window.scraperOverview;
  if (needsInstance) {
    window.scraperOverview = new ScraperOverview();
  }
  const { scraperOverview } = window;
  scraperOverview.show();
}
// Once the document has been parsed, create the shared controller instance
// and publish it on `window` for the global helper functions.
document.addEventListener("DOMContentLoaded", () => {
  window.scraperOverview = new ScraperOverview();
});