// File listing metadata: 501 lines, 16 KiB, JavaScript
/**
 * Scraper Overview functionality
 */

/**
 * Controller for the scraper overview modal (`#scraperOverviewModal`).
 *
 * Each time the modal is shown it fetches `/scraper/scrapers`,
 * `/scraper/status` and `/scraper/publishers` and renders the results into
 * the modal's placeholder elements. Every server-provided value is
 * HTML-escaped before it is written through `innerHTML`.
 */
class ScraperOverview {
  /**
   * Creates the controller with empty state and wires up the modal listener.
   */
  constructor() {
    this.modal = null;
    this.scrapers = [];
    this.systemConfig = {};
    this.publishersData = null;
    this.init();
  }

  /**
   * Looks up the modal element and reloads the overview whenever it is shown.
   * Does nothing when the modal is not present on the page.
   */
  init() {
    this.modal = document.getElementById("scraperOverviewModal");
    if (!this.modal) return;

    this.modal.addEventListener("show.bs.modal", () => {
      this.loadScraperOverview();
    });
  }

  /**
   * Escapes a value for safe interpolation into HTML text or attribute values.
   *
   * @param {*} value - Any value; `null`/`undefined` become an empty string.
   * @returns {string} The stringified value with `& < > " '` escaped.
   */
  escapeHtml(value) {
    if (value === null || value === undefined) return "";
    return String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");
  }

  /**
   * Fetches scrapers, system status and publishers in parallel and renders
   * them. Toggles the loading / content / error containers accordingly.
   * Errors are logged and shown in the error container; they are never
   * re-thrown, so the returned promise always resolves.
   *
   * @returns {Promise<void>}
   */
  async loadScraperOverview() {
    const loadingEl = document.getElementById("scraperOverviewLoading");
    const errorEl = document.getElementById("scraperOverviewError");
    const contentEl = document.getElementById("scraperOverviewContent");

    // Show loading state
    loadingEl?.classList.remove("d-none");
    errorEl?.classList.add("d-none");
    contentEl?.classList.add("d-none");

    try {
      const responses = await Promise.all([
        fetch("/scraper/scrapers"),
        fetch("/scraper/status"),
        fetch("/scraper/publishers"),
      ]);

      if (responses.some((response) => !response.ok)) {
        throw new Error("Failed to load scraper information");
      }

      const [scrapersData, statusData, publishersData] = await Promise.all(
        responses.map((response) => response.json())
      );

      if (
        !scrapersData.success ||
        !statusData.success ||
        !publishersData.success
      ) {
        throw new Error(
          scrapersData.message ||
            statusData.message ||
            publishersData.message ||
            "Unknown error"
        );
      }

      // Guard against a missing/non-array payload so rendering cannot crash.
      this.scrapers = Array.isArray(scrapersData.scrapers)
        ? scrapersData.scrapers
        : [];
      this.systemConfig = statusData;
      this.publishersData = publishersData.data;

      // Update UI
      this.updateSystemConfig();
      this.updateScrapersTable();
      this.updatePublishersSection();
      this.updateStatusFlowDiagram();

      // Show content
      loadingEl?.classList.add("d-none");
      contentEl?.classList.remove("d-none");
    } catch (error) {
      console.error("Error loading scraper overview:", error);

      // Show error state
      loadingEl?.classList.add("d-none");
      const errorMessage = document.getElementById(
        "scraperOverviewErrorMessage"
      );
      if (errorMessage) {
        errorMessage.textContent =
          error.message || "Failed to load scraper information";
      }
      errorEl?.classList.remove("d-none");
    }
  }

  /**
   * Renders the system configuration summary: active module, volume limit,
   * number of scraper modules and per-status paper counts.
   */
  updateSystemConfig() {
    // Current scraper module
    const currentModuleEl = document.getElementById("currentScraperModule");
    if (currentModuleEl) {
      currentModuleEl.textContent =
        this.systemConfig.current_scraper_module || "System Default";
      currentModuleEl.className = "badge bg-primary";
    }

    // Volume limit: only a missing/empty value is "Unknown"; 0 is shown as 0.
    const volumeLimitEl = document.getElementById("currentVolumeLimit");
    if (volumeLimitEl) {
      const volumeConfig = this.systemConfig.volume_config;
      const hasVolume =
        volumeConfig !== undefined &&
        volumeConfig !== null &&
        volumeConfig !== "";
      volumeLimitEl.textContent = hasVolume ? volumeConfig : "Unknown";
    }

    // Total modules
    const totalModulesEl = document.getElementById("totalScraperModules");
    if (totalModulesEl) {
      totalModulesEl.textContent = this.scrapers.length;
    }

    // Paper counts summary
    const paperCountsEl = document.getElementById("paperCountsSummary");
    if (paperCountsEl && this.systemConfig.paper_counts) {
      const counts = this.systemConfig.paper_counts;
      const badges = [
        ["bg-primary", counts.new, "New"],
        ["bg-warning", counts.processing, "Processing"],
        ["bg-success", counts.done, "Done"],
        ["bg-danger", counts.failed, "Failed"],
        ["bg-info", counts.pending, "Pending"],
        ["bg-secondary", counts.retrying, "Retrying"],
      ]
        .map(
          ([badgeClass, count, label]) =>
            `<span class="badge ${badgeClass}">${this.escapeHtml(
              count || 0
            )} ${label}</span>`
        )
        .join("\n");

      paperCountsEl.innerHTML = `
        <div class="d-flex flex-wrap gap-2">
          ${badges}
        </div>
      `;
    }
  }

  /**
   * Rebuilds the scrapers table: one row per scraper, an error row for
   * scrapers that failed to load, and a highlight on the active scraper.
   */
  updateScrapersTable() {
    const tbody = document.getElementById("scrapersTableBody");
    if (!tbody) return;

    tbody.innerHTML = "";

    for (const scraper of this.scrapers) {
      const row = document.createElement("tr");
      const safeName = this.escapeHtml(scraper.name);

      // Check if this is the current active scraper
      const isCurrentScraper =
        scraper.name === this.systemConfig.current_scraper_module;

      if (scraper.error) {
        row.innerHTML = `
          <td>${safeName}</td>
          <td colspan="5" class="text-danger">
            <i class="fas fa-exclamation-triangle"></i> ${this.escapeHtml(
              scraper.error
            )}
          </td>
        `;
      } else {
        const testBadge =
          scraper.name === "dummy"
            ? '<span class="badge bg-info ms-2">Test Module</span>'
            : "";
        const activeBadge = isCurrentScraper
          ? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
          : "";

        row.innerHTML = `
          <td>
            <strong>${safeName}</strong>
            ${testBadge}
            ${activeBadge}
          </td>
          <td class="scraper-description">
            ${this.escapeHtml(this.truncateDescription(scraper.description))}
          </td>
          <td class="input-status-list">
            ${this.renderStatusBadges(scraper.input_statuses, "bg-info")}
          </td>
          <td class="status-output">
            <span class="badge bg-success">${this.escapeHtml(
              scraper.output_status_success
            )}</span>
          </td>
          <td class="status-output">
            <span class="badge bg-danger">${this.escapeHtml(
              scraper.output_status_failure
            )}</span>
          </td>
          <td class="status-output">
            <span class="badge bg-warning">${this.escapeHtml(
              scraper.output_status_processing
            )}</span>
          </td>
        `;
      }

      // Highlight the current scraper row
      if (isCurrentScraper) {
        row.classList.add("table-success");
      }

      tbody.appendChild(row);
    }
  }

  /**
   * Renders the status flow diagram from the stages computed by
   * {@link ScraperOverview#analyzeScraperFlow}, followed by a static
   * explanation block.
   */
  updateStatusFlowDiagram() {
    const diagramEl = document.getElementById("statusFlowDiagram");
    if (!diagramEl) return;

    // Analyze actual scrapers to build real flow
    const statusFlow = this.analyzeScraperFlow();

    let diagramHTML = '<div class="status-flow-container">';

    statusFlow.forEach((stage, index) => {
      if (index > 0) {
        diagramHTML +=
          '<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>';
      }

      diagramHTML += '<div class="status-flow-stage mb-4 p-3 border rounded">';
      diagramHTML += `<div class="fw-bold mb-2 text-primary">${this.escapeHtml(
        stage.title
      )}</div>`;

      if (stage.scrapers && stage.scrapers.length > 0) {
        const handlers = stage.scrapers
          .map((name) => `<strong>${this.escapeHtml(name)}</strong>`)
          .join(", ");
        diagramHTML += `<div class="mb-2"><small class="text-muted">Handled by: ${handlers}</small></div>`;
      }

      diagramHTML += '<div class="status-badges">';
      stage.statuses.forEach((status, statusIndex) => {
        if (statusIndex > 0) {
          diagramHTML += '<i class="fas fa-arrow-right status-flow-arrow"></i>';
        }

        const badgeClass = this.getStatusBadgeClass(status);
        diagramHTML += `<span class="status-flow-node badge ${badgeClass}">${this.escapeHtml(
          status
        )}</span>`;
      });
      diagramHTML += "</div>";

      if (stage.description) {
        diagramHTML += `<div class="small text-muted mt-2">${this.escapeHtml(
          stage.description
        )}</div>`;
      }

      diagramHTML += "</div>";
    });

    diagramHTML += "</div>";

    // Add explanation
    diagramHTML += `
      <div class="mt-4 p-3 bg-light rounded">
        <h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
        <ul class="small mb-0">
          <li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
          <li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
          <li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
          <li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
          <li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
        </ul>
      </div>
    `;

    diagramEl.innerHTML = diagramHTML;
  }

  /**
   * Derives the diagram stages from the loaded scrapers' input and output
   * statuses.
   *
   * @returns {Array<{title: string, statuses: string[], scrapers: string[], description: string}>}
   *   Stages in display order: entry point (if any scraper accepts "New"),
   *   processing stages (any other non-final input status), final states
   *   (always present), and retry processing (if any scraper accepts "Failed").
   */
  analyzeScraperFlow() {
    const stages = [];
    const allInputStatuses = new Set();
    const allOutputStatuses = new Set();
    // Map rather than a plain object: keys come from the server, and a key
    // such as "__proto__" must not collide with Object.prototype.
    const scrapersByInput = new Map();

    for (const scraper of this.scrapers) {
      if (Array.isArray(scraper.input_statuses)) {
        for (const status of scraper.input_statuses) {
          allInputStatuses.add(status);
          if (!scrapersByInput.has(status)) {
            scrapersByInput.set(status, []);
          }
          scrapersByInput.get(status).push(scraper.name);
        }
      }

      if (scraper.output_status_success) {
        allOutputStatuses.add(scraper.output_status_success);
      }
      if (scraper.output_status_failure) {
        allOutputStatuses.add(scraper.output_status_failure);
      }
    }

    // Entry point
    if (allInputStatuses.has("New")) {
      stages.push({
        title: "Entry Point",
        statuses: ["New"],
        scrapers: scrapersByInput.get("New") || [],
        description: "Newly uploaded papers enter the processing pipeline",
      });
    }

    // Processing stages
    const processingStatuses = [...allInputStatuses].filter(
      (status) => !["New", "Done", "Failed"].includes(status)
    );

    if (processingStatuses.length > 0) {
      stages.push({
        title: "Processing Stages",
        statuses: processingStatuses,
        scrapers: [],
        description: "Papers move through various processing stages",
      });
    }

    // Final outputs
    stages.push({
      title: "Final States",
      statuses: ["Done", "Failed"].filter((status) =>
        allOutputStatuses.has(status)
      ),
      scrapers: [],
      description: "Papers end up in final success or failure states",
    });

    // Retry handling
    if (allInputStatuses.has("Failed")) {
      stages.push({
        title: "Retry Processing",
        statuses: ["Failed", "Retrying"],
        scrapers: scrapersByInput.get("Failed") || [],
        description: "Failed papers can be retried with specialized scrapers",
      });
    }

    return stages;
  }

  /**
   * Maps a paper status to its Bootstrap background class.
   *
   * @param {string} status - Status name, e.g. "New" or "Done".
   * @param {string} [fallbackClass="bg-secondary"] - Class for unknown statuses.
   * @returns {string} A Bootstrap `bg-*` class name.
   */
  getStatusBadgeClass(status, fallbackClass = "bg-secondary") {
    const statusClasses = {
      New: "bg-primary",
      Pending: "bg-warning",
      Processing: "bg-warning",
      Retrying: "bg-warning",
      Done: "bg-success",
      Failed: "bg-danger",
      HtmlDownloaded: "bg-info",
      PublisherDetected: "bg-info",
      TextExtracted: "bg-info",
    };

    // Own-property check so names like "constructor" do not resolve to
    // members inherited from Object.prototype.
    return Object.prototype.hasOwnProperty.call(statusClasses, status)
      ? statusClasses[status]
      : fallbackClass;
  }

  /**
   * Renders a list of statuses as badge markup.
   *
   * @param {string[]} statuses - Status names; non-arrays render as "".
   * @param {string} [defaultClass="bg-secondary"] - Class used for statuses
   *   without a dedicated colour.
   * @returns {string} Concatenated `<span>` badges with escaped labels.
   */
  renderStatusBadges(statuses, defaultClass = "bg-secondary") {
    if (!Array.isArray(statuses)) return "";

    return statuses
      .map(
        (status) =>
          `<span class="badge ${this.getStatusBadgeClass(
            status,
            defaultClass
          )} status-badge">${this.escapeHtml(status)}</span>`
      )
      .join("");
  }

  /**
   * Shortens a description to at most `maxLength` characters plus "...".
   * Returns plain text; callers must escape it before inserting as HTML.
   *
   * @param {string} description - Text to shorten; falsy yields a placeholder.
   * @param {number} [maxLength=100] - Maximum length before truncation.
   * @returns {string} The original, truncated, or placeholder text.
   */
  truncateDescription(description, maxLength = 100) {
    if (!description) return "No description available";

    const text = String(description);
    if (text.length <= maxLength) return text;

    return text.substring(0, maxLength).trim() + "...";
  }

  /**
   * Renders the publisher statistics cards and the publishers table from
   * `this.publishersData`. Shows a placeholder row when no publishers exist.
   */
  updatePublishersSection() {
    // Update publisher statistics
    const publisherStatsEl = document.getElementById("publisherStats");
    if (publisherStatsEl && this.publishersData && this.publishersData.stats) {
      const stats = this.publishersData.stats;
      publisherStatsEl.innerHTML = [
        ["text-primary", stats.total_publishers, "Total Publishers"],
        ["text-success", stats.publishers_with_parsers, "With Parsers"],
        ["text-warning", stats.publishers_without_parsers, "Missing Parsers"],
        ["text-info", stats.total_papers_with_publisher, "Papers with Publisher"],
      ]
        .map(
          ([colorClass, value, label]) => `
        <div class="col-md-3">
          <div class="text-center">
            <div class="h4 ${colorClass} mb-1">${this.escapeHtml(value)}</div>
            <div class="text-muted small">${label}</div>
          </div>
        </div>`
        )
        .join("\n");
    }

    // Update publishers table
    const publishersTableBody = document.getElementById("publishersTableBody");
    if (
      !publishersTableBody ||
      !this.publishersData ||
      !this.publishersData.publishers
    ) {
      return;
    }

    publishersTableBody.innerHTML = "";

    if (this.publishersData.publishers.length === 0) {
      publishersTableBody.innerHTML = `
        <tr>
          <td colspan="4" class="text-center text-muted py-4">
            <i class="fas fa-info-circle"></i> No publishers detected yet.<br>
            <small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
          </td>
        </tr>
      `;
      return;
    }

    this.publishersData.publishers.forEach((publisher) => {
      const row = document.createElement("tr");

      // Publisher status badge
      const statusBadge = publisher.has_parser
        ? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
        : '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';

      // Parser availability indicator
      const parserIndicator = publisher.has_parser
        ? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
        : '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';

      row.innerHTML = `
        <td>
          <strong>${this.escapeHtml(publisher.name)}</strong>
        </td>
        <td>
          <span class="badge bg-info">${this.escapeHtml(
            publisher.paper_count
          )}</span>
        </td>
        <td>${statusBadge}</td>
        <td class="text-center">${parserIndicator}</td>
      `;

      publishersTableBody.appendChild(row);
    });
  }

  /**
   * Public method to show the modal. Reuses the Bootstrap modal instance
   * already attached to the element instead of creating one per call.
   */
  show() {
    if (!this.modal) return;

    const bootstrapModal =
      bootstrap.Modal.getInstance(this.modal) ||
      new bootstrap.Modal(this.modal);
    bootstrapModal.show();
  }
}

/**
 * Global function to load scraper overview (used by retry button).
 *
 * Delegates to the shared `window.scraperOverview` instance when it exists.
 *
 * @returns {Promise<void>|undefined} Whatever the instance's
 *   `loadScraperOverview()` returns, or `undefined` when no instance has
 *   been created yet.
 */
function loadScraperOverview() {
  if (!window.scraperOverview) {
    return undefined;
  }
  return window.scraperOverview.loadScraperOverview();
}

/**
 * Global function to show scraper overview modal.
 *
 * Lazily creates the shared `window.scraperOverview` instance if it does
 * not exist yet, then asks it to show the modal.
 */
function showScraperOverview() {
  if (!window.scraperOverview) {
    window.scraperOverview = new ScraperOverview();
  }
  window.scraperOverview.show();
}

// Initialize when DOM is ready.
// - If the script is loaded after DOMContentLoaded has already fired, the
//   event would never arrive, so initialise immediately in that case.
// - Reuse an instance already created by showScraperOverview() so the modal
//   does not end up with two "show.bs.modal" listeners.
(function () {
  const createScraperOverview = function () {
    if (!window.scraperOverview) {
      window.scraperOverview = new ScraperOverview();
    }
  };

  if (document.readyState === "loading") {
    document.addEventListener("DOMContentLoaded", createScraperOverview);
  } else {
    createScraperOverview();
  }
})();