SciPaperLoader/scipaperloader/static/js/scraper-overview.js

501 lines
16 KiB
JavaScript

/**
 * Scraper Overview functionality.
 *
 * Controller for the "scraperOverviewModal" dialog. When the modal is shown it
 * fetches the scraper list, the scraper status and the publisher list, then
 * renders the system configuration, the scrapers table, the publishers section
 * and a status flow diagram into the modal.
 *
 * Every value that originates from the server is HTML-escaped before it is
 * placed into an `innerHTML` string, so a scraper name, error text, status or
 * publisher name can never be interpreted as markup.
 */
class ScraperOverview {
  /**
   * Create the controller and immediately bind it to the modal element
   * (if that element exists in the current document).
   */
  constructor() {
    this.modal = null;
    this.scrapers = [];
    this.systemConfig = {};
    // Populated by loadScraperOverview(); null until the first successful load.
    this.publishersData = null;
    this.init();
  }

  /**
   * Look up the modal element and reload the overview each time it is shown.
   */
  init() {
    this.modal = document.getElementById("scraperOverviewModal");
    if (this.modal) {
      this.modal.addEventListener("show.bs.modal", () => {
        this.loadScraperOverview();
      });
    }
  }

  /**
   * Escape a value for safe interpolation into HTML text or a quoted attribute.
   *
   * @param {*} value - Any value; it is converted with `String()` first.
   * @returns {string} The value with `& < > " '` replaced by HTML entities.
   */
  escapeHtml(value) {
    const entities = {
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': "&quot;",
      "'": "&#39;",
    };
    return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
  }

  /**
   * Fetch scraper, status and publisher data in parallel and render it.
   *
   * Toggles the loading / error / content containers. Any failure (network,
   * non-2xx response, `success: false` payload, or a rendering error) is
   * logged and shown in the error container; nothing is re-thrown.
   *
   * @returns {Promise<void>}
   */
  async loadScraperOverview() {
    const loadingEl = document.getElementById("scraperOverviewLoading");
    const errorEl = document.getElementById("scraperOverviewError");
    const contentEl = document.getElementById("scraperOverviewContent");

    // Show loading state
    loadingEl?.classList.remove("d-none");
    errorEl?.classList.add("d-none");
    contentEl?.classList.add("d-none");

    try {
      const responses = await Promise.all([
        fetch("/scraper/scrapers"),
        fetch("/scraper/status"),
        fetch("/scraper/publishers"),
      ]);
      if (responses.some((response) => !response.ok)) {
        throw new Error("Failed to load scraper information");
      }

      const [scrapersData, statusData, publishersData] = await Promise.all(
        responses.map((response) => response.json())
      );
      if (
        !scrapersData.success ||
        !statusData.success ||
        !publishersData.success
      ) {
        throw new Error(
          scrapersData.message ||
            statusData.message ||
            publishersData.message ||
            "Unknown error"
        );
      }

      // Guard against a payload without a scrapers array so the renderers
      // below can always iterate.
      this.scrapers = Array.isArray(scrapersData.scrapers)
        ? scrapersData.scrapers
        : [];
      this.systemConfig = statusData;
      this.publishersData = publishersData.data;

      // Update UI
      this.updateSystemConfig();
      this.updateScrapersTable();
      this.updatePublishersSection();
      this.updateStatusFlowDiagram();

      // Show content
      loadingEl?.classList.add("d-none");
      contentEl?.classList.remove("d-none");
    } catch (error) {
      console.error("Error loading scraper overview:", error);

      // Show error state
      loadingEl?.classList.add("d-none");
      const errorMessage = document.getElementById(
        "scraperOverviewErrorMessage"
      );
      if (errorMessage) {
        errorMessage.textContent =
          error.message || "Failed to load scraper information";
      }
      errorEl?.classList.remove("d-none");
    }
  }

  /**
   * Render the system configuration summary: active module, volume limit,
   * number of scraper modules and the per-status paper counts.
   */
  updateSystemConfig() {
    const currentModuleEl = document.getElementById("currentScraperModule");
    if (currentModuleEl) {
      currentModuleEl.textContent =
        this.systemConfig.current_scraper_module || "System Default";
      currentModuleEl.className = "badge bg-primary";
    }

    const volumeLimitEl = document.getElementById("currentVolumeLimit");
    if (volumeLimitEl) {
      // `??` rather than `||` so that a configured limit of 0 is displayed
      // instead of being replaced by "Unknown".
      volumeLimitEl.textContent = this.systemConfig.volume_config ?? "Unknown";
    }

    const totalModulesEl = document.getElementById("totalScraperModules");
    if (totalModulesEl) {
      totalModulesEl.textContent = this.scrapers.length;
    }

    const paperCountsEl = document.getElementById("paperCountsSummary");
    if (paperCountsEl && this.systemConfig.paper_counts) {
      const counts = this.systemConfig.paper_counts;
      // [key in paper_counts, badge colour class, label]
      const badgeSpecs = [
        ["new", "bg-primary", "New"],
        ["processing", "bg-warning", "Processing"],
        ["done", "bg-success", "Done"],
        ["failed", "bg-danger", "Failed"],
        ["pending", "bg-info", "Pending"],
        ["retrying", "bg-secondary", "Retrying"],
      ];
      const badges = badgeSpecs
        .map(
          ([key, badgeClass, label]) =>
            `<span class="badge ${badgeClass}">${this.escapeHtml(
              counts[key] || 0
            )} ${label}</span>`
        )
        .join("\n");
      paperCountsEl.innerHTML = `<div class="d-flex flex-wrap gap-2">\n${badges}\n</div>`;
    }
  }

  /**
   * Rebuild the scrapers table body, one row per entry in `this.scrapers`.
   * A scraper carrying an `error` gets a single error cell; the scraper whose
   * name equals `systemConfig.current_scraper_module` is highlighted.
   */
  updateScrapersTable() {
    const tbody = document.getElementById("scrapersTableBody");
    if (!tbody) return;

    tbody.innerHTML = "";
    this.scrapers.forEach((scraper) => {
      const row = document.createElement("tr");
      const isCurrentScraper =
        scraper.name === this.systemConfig.current_scraper_module;
      const safeName = this.escapeHtml(scraper.name);

      if (scraper.error) {
        row.innerHTML = `
          <td>${safeName}</td>
          <td colspan="5" class="text-danger">
            <i class="fas fa-exclamation-triangle"></i> ${this.escapeHtml(
              scraper.error
            )}
          </td>
        `;
      } else {
        const testBadge =
          scraper.name === "dummy"
            ? '<span class="badge bg-info ms-2">Test Module</span>'
            : "";
        const activeBadge = isCurrentScraper
          ? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
          : "";
        row.innerHTML = `
          <td>
            <strong>${safeName}</strong>
            ${testBadge}
            ${activeBadge}
          </td>
          <td class="scraper-description">
            ${this.escapeHtml(this.truncateDescription(scraper.description))}
          </td>
          <td class="input-status-list">
            ${this.renderStatusBadges(scraper.input_statuses, "bg-info")}
          </td>
          <td class="status-output">
            <span class="badge bg-success">${this.escapeHtml(
              scraper.output_status_success
            )}</span>
          </td>
          <td class="status-output">
            <span class="badge bg-danger">${this.escapeHtml(
              scraper.output_status_failure
            )}</span>
          </td>
          <td class="status-output">
            <span class="badge bg-warning">${this.escapeHtml(
              scraper.output_status_processing
            )}</span>
          </td>
        `;
      }

      // Highlight the current scraper row
      if (isCurrentScraper) {
        row.classList.add("table-success");
      }
      tbody.appendChild(row);
    });
  }

  /**
   * Render the status flow diagram from the stages computed by
   * analyzeScraperFlow(), followed by a static explanation panel.
   */
  updateStatusFlowDiagram() {
    const diagramEl = document.getElementById("statusFlowDiagram");
    if (!diagramEl) return;

    const stagesHtml = this.analyzeScraperFlow()
      .map((stage, index) => {
        const parts = [];
        // A downward arrow separates consecutive stages.
        if (index > 0) {
          parts.push(
            '<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>'
          );
        }
        parts.push('<div class="status-flow-stage mb-4 p-3 border rounded">');
        parts.push(
          `<div class="fw-bold mb-2 text-primary">${this.escapeHtml(
            stage.title
          )}</div>`
        );
        if (stage.scrapers && stage.scrapers.length > 0) {
          const handlers = stage.scrapers
            .map((name) => `<strong>${this.escapeHtml(name)}</strong>`)
            .join(", ");
          parts.push(
            `<div class="mb-2"><small class="text-muted">Handled by: ${handlers}</small></div>`
          );
        }
        // Status nodes within a stage are joined by right-pointing arrows.
        const nodes = stage.statuses
          .map(
            (status) =>
              `<span class="status-flow-node badge ${this.getStatusBadgeClass(
                status
              )}">${this.escapeHtml(status)}</span>`
          )
          .join('<i class="fas fa-arrow-right status-flow-arrow"></i>');
        parts.push(`<div class="status-badges">${nodes}</div>`);
        if (stage.description) {
          parts.push(
            `<div class="small text-muted mt-2">${this.escapeHtml(
              stage.description
            )}</div>`
          );
        }
        parts.push("</div>");
        return parts.join("");
      })
      .join("");

    const explanationHtml = `
      <div class="mt-4 p-3 bg-light rounded">
        <h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
        <ul class="small mb-0">
          <li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
          <li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
          <li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
          <li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
          <li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
        </ul>
      </div>
    `;

    diagramEl.innerHTML = `<div class="status-flow-container">${stagesHtml}</div>${explanationHtml}`;
  }

  /**
   * Derive the diagram stages from the input/output statuses of
   * `this.scrapers`.
   *
   * @returns {Array<{title: string, statuses: string[], scrapers: string[], description: string}>}
   *   Stages in display order: optional "Entry Point", optional
   *   "Processing Stages", always "Final States", optional "Retry Processing".
   */
  analyzeScraperFlow() {
    const stages = [];
    const allInputStatuses = new Set();
    const allOutputStatuses = new Set();
    // Map (not a plain object) because status names come from the server:
    // a status called "constructor" or "toString" must not hit Object.prototype.
    const scrapersByInput = new Map();

    this.scrapers.forEach((scraper) => {
      if (Array.isArray(scraper.input_statuses)) {
        scraper.input_statuses.forEach((status) => {
          allInputStatuses.add(status);
          if (!scrapersByInput.has(status)) {
            scrapersByInput.set(status, []);
          }
          scrapersByInput.get(status).push(scraper.name);
        });
      }
      if (scraper.output_status_success) {
        allOutputStatuses.add(scraper.output_status_success);
      }
      if (scraper.output_status_failure) {
        allOutputStatuses.add(scraper.output_status_failure);
      }
    });

    // Entry point
    if (allInputStatuses.has("New")) {
      stages.push({
        title: "Entry Point",
        statuses: ["New"],
        scrapers: scrapersByInput.get("New") || [],
        description: "Newly uploaded papers enter the processing pipeline",
      });
    }

    // Processing stages: every input status that is neither entry nor final.
    const processingStatuses = Array.from(allInputStatuses).filter(
      (status) => !["New", "Done", "Failed"].includes(status)
    );
    if (processingStatuses.length > 0) {
      stages.push({
        title: "Processing Stages",
        statuses: processingStatuses,
        scrapers: [],
        description: "Papers move through various processing stages",
      });
    }

    // Final outputs (only those some scraper actually produces)
    stages.push({
      title: "Final States",
      statuses: ["Done", "Failed"].filter((status) =>
        allOutputStatuses.has(status)
      ),
      scrapers: [],
      description: "Papers end up in final success or failure states",
    });

    // Retry handling
    if (allInputStatuses.has("Failed")) {
      stages.push({
        title: "Retry Processing",
        statuses: ["Failed", "Retrying"],
        scrapers: scrapersByInput.get("Failed") || [],
        description: "Failed papers can be retried with specialized scrapers",
      });
    }

    return stages;
  }

  /**
   * Map a paper status to its Bootstrap badge colour class.
   *
   * @param {string} status - Status name.
   * @returns {string} The colour class, or "bg-secondary" for unknown statuses.
   */
  getStatusBadgeClass(status) {
    const statusClasses = {
      New: "bg-primary",
      Pending: "bg-warning",
      Processing: "bg-warning",
      Retrying: "bg-warning",
      Done: "bg-success",
      Failed: "bg-danger",
      HtmlDownloaded: "bg-info",
      PublisherDetected: "bg-info",
      TextExtracted: "bg-info",
    };
    // Own-property check so inherited keys ("constructor", "toString", ...)
    // fall through to the default instead of yielding a function.
    return Object.prototype.hasOwnProperty.call(statusClasses, status)
      ? statusClasses[status]
      : "bg-secondary";
  }

  /**
   * Render a list of statuses as badge `<span>` elements.
   *
   * @param {string[]} statuses - Status names; anything that is not an array
   *   yields an empty string.
   * @param {string} [defaultClass="bg-secondary"] - Accepted for interface
   *   compatibility; the colour is always taken from getStatusBadgeClass().
   * @returns {string} Concatenated badge HTML with escaped status text.
   */
  renderStatusBadges(statuses, defaultClass = "bg-secondary") {
    if (!Array.isArray(statuses)) return "";
    return statuses
      .map(
        (status) =>
          `<span class="badge ${this.getStatusBadgeClass(
            status
          )} status-badge">${this.escapeHtml(status)}</span>`
      )
      .join("");
  }

  /**
   * Shorten a description for table display.
   *
   * @param {string} description - Raw description text.
   * @param {number} [maxLength=100] - Maximum length before truncation.
   * @returns {string} The unescaped description, a truncated copy ending in
   *   "...", or "No description available" when the description is empty.
   */
  truncateDescription(description, maxLength = 100) {
    if (!description) return "No description available";
    // Coerce so a non-string description cannot throw on substring().
    const text = String(description);
    if (text.length <= maxLength) return text;
    return text.substring(0, maxLength).trim() + "...";
  }

  /**
   * Render the publisher statistics tiles and the publishers table from
   * `this.publishersData`.
   */
  updatePublishersSection() {
    const data = this.publishersData;

    // Update publisher statistics
    const publisherStatsEl = document.getElementById("publisherStats");
    if (publisherStatsEl && data && data.stats) {
      const stats = data.stats;
      // [key in stats, text colour class, caption]
      const tiles = [
        ["total_publishers", "text-primary", "Total Publishers"],
        ["publishers_with_parsers", "text-success", "With Parsers"],
        ["publishers_without_parsers", "text-warning", "Missing Parsers"],
        ["total_papers_with_publisher", "text-info", "Papers with Publisher"],
      ];
      publisherStatsEl.innerHTML = tiles
        .map(
          ([key, colourClass, caption]) => `
          <div class="col-md-3">
            <div class="text-center">
              <div class="h4 ${colourClass} mb-1">${this.escapeHtml(
            stats[key] ?? 0
          )}</div>
              <div class="text-muted small">${caption}</div>
            </div>
          </div>`
        )
        .join("");
    }

    // Update publishers table
    const publishersTableBody = document.getElementById("publishersTableBody");
    if (!publishersTableBody || !data || !data.publishers) return;

    publishersTableBody.innerHTML = "";
    if (data.publishers.length === 0) {
      publishersTableBody.innerHTML = `
        <tr>
          <td colspan="4" class="text-center text-muted py-4">
            <i class="fas fa-info-circle"></i> No publishers detected yet.<br>
            <small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
          </td>
        </tr>
      `;
      return;
    }

    data.publishers.forEach((publisher) => {
      const row = document.createElement("tr");

      // Publisher status badge
      const statusBadge = publisher.has_parser
        ? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
        : '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';

      // Parser availability indicator
      const parserIndicator = publisher.has_parser
        ? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
        : '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';

      row.innerHTML = `
        <td>
          <strong>${this.escapeHtml(publisher.name)}</strong>
        </td>
        <td>
          <span class="badge bg-info">${this.escapeHtml(
            publisher.paper_count
          )}</span>
        </td>
        <td>${statusBadge}</td>
        <td class="text-center">${parserIndicator}</td>
      `;
      publishersTableBody.appendChild(row);
    });
  }

  /**
   * Public method to show the modal. Does nothing when the modal element was
   * not found.
   */
  show() {
    if (!this.modal) return;
    // Reuse the Modal instance already attached to the element; constructing a
    // second one for the same element duplicates Bootstrap's internal state.
    const bootstrapModal =
      bootstrap.Modal.getInstance(this.modal) ||
      new bootstrap.Modal(this.modal);
    bootstrapModal.show();
  }
}
/**
 * Global entry point for reloading the overview (used by the retry button).
 * Does nothing when no overview instance has been registered on `window`.
 */
function loadScraperOverview() {
  const overview = window.scraperOverview;
  if (!overview) {
    return;
  }
  overview.loadScraperOverview();
}
/**
 * Global entry point for opening the scraper overview modal.
 * Reuses the instance stored on `window`, creating and storing one first
 * when none exists yet.
 */
function showScraperOverview() {
  const overview =
    window.scraperOverview || (window.scraperOverview = new ScraperOverview());
  overview.show();
}
// Once the document has been parsed, create the overview controller and
// publish it on `window` so the global helper functions can reach it.
document.addEventListener("DOMContentLoaded", () => {
  const overview = new ScraperOverview();
  window.scraperOverview = overview;
});