Files
firecrawl/apps/api/src/controllers/v0/crawl-status.ts
T

128 lines
3.7 KiB
TypeScript
Raw Normal View History

2024-04-20 16:38:05 -07:00
import { Request, Response } from "express";
2024-08-26 18:48:00 -03:00
import { authenticateUser } from "../auth";
2024-08-15 21:51:59 +02:00
import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
2024-11-07 20:57:33 +01:00
import { logger } from "../../../src/lib/logger";
2024-08-15 21:51:59 +02:00
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
2024-09-04 15:57:57 -03:00
import { configDotenv } from "dotenv";
2024-11-07 20:57:33 +01:00
import { Job } from "bullmq";
import { toLegacyDocument } from "../v1/types";
2024-09-04 15:57:57 -03:00
// Load variables from a .env file into process.env when this module is first
// imported, so the USE_DB_AUTHENTICATION check in getJobs can see them.
configDotenv();
2024-08-16 23:39:39 +02:00
/**
 * Fetches the scrape-queue jobs with the given ids and normalizes each job's
 * `returnvalue` to a single document.
 *
 * Ids for which the queue returns nothing are silently dropped. When the
 * `USE_DB_AUTHENTICATION` environment variable is exactly `"true"`, the rows
 * returned by `supabaseGetJobsByCrawlId` take precedence: for every row whose
 * `job_id` matches a fetched job, that job's `returnvalue` is overwritten with
 * the row's `docs`.
 *
 * Note that the returned `Job` objects are mutated in place.
 *
 * @param crawlId - Crawl identifier used for the database lookup.
 * @param ids - Queue job ids belonging to the crawl.
 * @returns The jobs that exist in the queue, in the order of `ids`.
 */
export async function getJobs(crawlId: string, ids: string[]): Promise<Job[]> {
  const jobs = (
    await Promise.all(ids.map((id) => getScrapeQueue().getJob(id)))
  ).filter((job): job is Job => Boolean(job));

  if (process.env.USE_DB_AUTHENTICATION === "true") {
    const supabaseData = await supabaseGetJobsByCrawlId(crawlId);

    // Index the jobs once so each database row is matched in O(1) instead of
    // scanning the whole job list per row. The first job wins on a duplicate
    // id, which is what a linear `find` would have returned.
    const jobsById = new Map<Job["id"], Job>();
    for (const job of jobs) {
      if (!jobsById.has(job.id)) {
        jobsById.set(job.id, job);
      }
    }

    for (const row of supabaseData) {
      const job = jobsById.get(row.job_id);
      if (job) {
        job.returnvalue = row.docs;
      }
    }
  }

  // A return value may be stored as an array; callers only want its first
  // element.
  for (const job of jobs) {
    job.returnvalue = Array.isArray(job.returnvalue)
      ? job.returnvalue[0]
      : job.returnvalue;
  }

  return jobs;
}
2024-04-20 16:38:05 -07:00
/**
 * Express handler that reports the progress of a crawl (v0 API).
 *
 * Flow:
 *  1. Authenticate the caller; on failure reply with the status and error
 *     supplied by `authenticateUser`.
 *  2. Load the crawl named by `req.params.jobId`; reply 404 if it does not
 *     exist and 403 if it belongs to a different team.
 *  3. Load the crawl's jobs and their queue states, discard jobs whose state
 *     is `"failed"` or `"unknown"`, and order the rest by `timestamp`.
 *  4. Reply with `{ status, current, total, data, partial_data }`:
 *     - `status` is `"failed"` for a cancelled crawl, `"completed"` when every
 *       remaining job is completed, otherwise `"active"`.
 *     - `data` holds the documents only when `status` is `"completed"`;
 *       otherwise it is `null` and the documents available so far are in
 *       `partial_data`.
 *
 * Any thrown value is reported to Sentry, logged, and answered with a 500.
 *
 * @param req - Incoming request; `req.params.jobId` is the crawl id.
 * @param res - Response used to send the JSON reply.
 */
export async function crawlStatusController(req: Request, res: Response) {
  try {
    const auth = await authenticateUser(req, res, RateLimiterMode.CrawlStatus);
    if (!auth.success) {
      return res.status(auth.status).json({ error: auth.error });
    }

    const { team_id } = auth;

    const sc = await getCrawl(req.params.jobId);
    if (!sc) {
      return res.status(404).json({ error: "Job not found" });
    }

    if (sc.team_id !== team_id) {
      return res.status(403).json({ error: "Forbidden" });
    }

    const jobIDs = await getCrawlJobs(req.params.jobId);
    const fetchedJobs = await getJobs(req.params.jobId, jobIDs);
    const fetchedStatuses = await Promise.all(
      fetchedJobs.map((job) => job.getState())
    );

    // Pair each job with its state, drop failed/unknown jobs, then order the
    // survivors by their timestamp.
    const jobsWithStatuses = fetchedJobs
      .map((job, index) => ({ job, status: fetchedStatuses[index] }))
      .filter(({ status }) => status !== "failed" && status !== "unknown")
      .sort((a, b) => a.job.timestamp - b.job.timestamp);

    const jobs = jobsWithStatuses.map(({ job }) => job);
    const jobStatuses = jobsWithStatuses.map(({ status }) => status);

    const jobStatus = sc.cancelled
      ? "failed"
      : jobStatuses.every((status) => status === "completed")
        ? "completed"
        : "active";

    // Keep only jobs that actually produced a document. `!= null` also rejects
    // `undefined`, which a job ends up with when its stored result was an
    // empty array.
    // NOTE(review): "Concurreny" looks like a typo of "Concurrency". The
    // literal is compared against a reason set outside this file, so it is
    // left untouched here — confirm against the code that sets failedReason.
    const data = jobs
      .filter(
        (job) =>
          job.failedReason !== "Concurreny limit hit" &&
          job.returnvalue != null
      )
      .map((job) =>
        Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue
      );

    // The first job's page options decide for the whole crawl whether raw
    // HTML is stripped from the documents.
    const firstJob = jobs[0];
    if (
      jobs.length > 0 &&
      firstJob.data &&
      firstJob.data.pageOptions &&
      !firstJob.data.pageOptions.includeRawHtml
    ) {
      for (const item of data) {
        if (item) {
          delete item.rawHtml;
        }
      }
    }

    const isCompleted = jobStatus === "completed";

    res.json({
      status: jobStatus,
      // Failed jobs were already filtered out above, so in practice this
      // counts the completed ones.
      current: jobStatuses.filter(
        (status) => status === "completed" || status === "failed"
      ).length,
      total: jobs.length,
      data: isCompleted
        ? data.map((doc) => toLegacyDocument(doc, sc.internalOptions))
        : null,
      partial_data: isCompleted
        ? []
        : data
            .filter((doc) => doc != null)
            .map((doc) => toLegacyDocument(doc, sc.internalOptions)),
    });
  } catch (error) {
    Sentry.captureException(error);
    logger.error(error);
    // Anything can be thrown, not just Error instances. Fall back to the
    // value's string form when it has no `message`, so the client always gets
    // a description and a thrown null/undefined cannot make this handler
    // throw again.
    const message =
      (error as { message?: unknown } | null | undefined)?.message ??
      String(error);
    return res.status(500).json({ error: message });
  }
}