Files
firecrawl/apps/api/src/services/queue-worker.ts
T

241 lines
7.1 KiB
TypeScript
Raw Normal View History

2024-04-15 17:01:47 -04:00
import { CustomError } from "../lib/custom-error";
2024-07-30 14:44:13 -04:00
import {
getWebScraperQueue,
getScrapeQueue,
redisConnection,
webScraperQueueName,
scrapeQueueName,
} from "./queue-service";
2024-04-15 17:01:47 -04:00
import "dotenv/config";
import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";
2024-04-20 13:53:11 -07:00
import { logJob } from "./logging/log_job";
2024-07-30 14:44:13 -04:00
import { initSDK } from "@hyperdx/node-opentelemetry";
2024-08-06 16:26:46 +02:00
import { Job, QueueEvents, tryCatch } from "bullmq";
2024-07-23 17:30:46 -03:00
import { Logger } from "../lib/logger";
2024-07-24 18:44:14 +02:00
import { ScrapeEvents } from "../lib/scrape-events";
2024-07-30 13:27:23 -04:00
import { Worker } from "bullmq";
import systemMonitor from "./system-monitor";
import { v4 as uuidv4 } from "uuid";
2024-05-20 13:36:34 -07:00
2024-07-30 14:44:13 -04:00
// HyperDX telemetry is only initialised when ENV is exactly "production";
// console capture is switched on and no extra instrumentations are registered.
const runningInProduction = process.env.ENV === "production";
if (runningInProduction) {
  initSDK({ consoleCapture: true, additionalInstrumentations: [] });
}
2024-07-30 13:27:23 -04:00
/**
 * Returns a promise that settles (with no value) once a timer of `ms`
 * milliseconds has fired. Used to pace the polling loops in this module.
 */
const sleep = (ms: number) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
2024-04-15 17:01:47 -04:00
2024-07-30 13:27:23 -04:00
/**
 * Resolves a numeric setting from the environment.
 *
 * The variables in `names` are tried in order and the first one that parses to
 * a non-zero number wins. Unset, empty, non-numeric and `0` values are all
 * treated as "not configured" (the same semantics as `Number(value) || fallback`),
 * in which case `fallback` is returned.
 *
 * @param names    Environment variable names, highest priority first.
 * @param fallback Value used when none of the variables yields a usable number.
 * @returns The configured number, or `fallback`.
 */
const envNumber = (names: readonly string[], fallback: number): number => {
  for (const name of names) {
    const parsed = Number(process.env[name]);
    if (parsed) {
      return parsed;
    }
  }
  return fallback;
};

// Lock/stall settings for the BullMQ worker (milliseconds).
const workerLockDuration = envNumber(["WORKER_LOCK_DURATION"], 60000);
const workerStalledCheckInterval = envNumber(
  ["WORKER_STALLED_CHECK_INTERVAL"],
  30000
);

// How often a running job renews its lock, and by how much (milliseconds).
const jobLockExtendInterval = envNumber(["JOB_LOCK_EXTEND_INTERVAL"], 15000);
const jobLockExtensionTime = envNumber(["JOB_LOCK_EXTENSION_TIME"], 60000);

// Pauses used by the polling loop (milliseconds).
const cantAcceptConnectionInterval = envNumber(
  ["CANT_ACCEPT_CONNECTION_INTERVAL"],
  2000
);
const connectionMonitorInterval = envNumber(
  ["CONNECTION_MONITOR_INTERVAL"],
  10
);
// Previously this read CONNECTION_MONITOR_INTERVAL only, so the two pauses
// could not be tuned independently. GOT_JOB_INTERVAL now takes precedence;
// the old variable is still honoured so existing deployments keep their value.
const gotJobInterval = envNumber(
  ["GOT_JOB_INTERVAL", "CONNECTION_MONITOR_INTERVAL"],
  20
);
2024-07-11 20:08:21 +02:00
// Both queue handles are obtained while the module loads: the web-scraper
// queue first, then the scrape queue.
const [wsq, sq] = [getWebScraperQueue(), getScrapeQueue()];
2024-07-11 20:08:21 +02:00
2024-07-30 13:27:23 -04:00
/**
 * Runs one manually-fetched job and reports its outcome back to the queue.
 *
 * While the job is running, its lock is renewed every `jobLockExtendInterval`
 * ms for another `jobLockExtensionTime` ms. The renewal timer is always
 * cleared when the job settles.
 *
 * This function never rejects: callers start it without awaiting it, so every
 * failure is logged here instead of surfacing as an unhandled rejection.
 *
 * @param token Lock token that was used to fetch `job` from the worker.
 * @param job   The job to process.
 */
const processJobInternal = async (token: string, job: Job) => {
  const extendLockInterval = setInterval(async () => {
    Logger.info(`🐂 Worker extending lock on job ${job.id}`);
    try {
      await job.extendLock(token, jobLockExtensionTime);
    } catch (error) {
      // A rejection inside a timer callback has no awaiting caller; log it
      // rather than letting it become an unhandled rejection.
      Logger.error(`🐂 Failed to extend lock on job ${job.id} - ${error}`);
    }
  }, jobLockExtendInterval);

  try {
    const result = await processJob(job, token);

    if (!result) {
      // processJob returns nothing when it decided to ignore an error because
      // the queue is paused. As before, the job state is left untouched.
      Logger.debug(
        `🐂 Job ${job.id} produced no result, leaving its state untouched`
      );
      return;
    }

    try {
      await job.moveToCompleted(result.docs, token, false);
    } catch (error) {
      // Deliberately best-effort: processJob may already have moved the job
      // to failed (cancelled jobs), in which case this presumably rejects.
      // Keep going, but leave a trace instead of swallowing silently.
      Logger.debug(
        `🐂 Could not move job ${job.id} to completed - ${error}`
      );
    }
  } catch (error) {
    console.log("Job failed, error:", error);
    // moveToFailed expects an Error; wrap anything else that was thrown.
    const failure = error instanceof Error ? error : new Error(String(error));
    try {
      await job.moveToFailed(failure, token, false);
    } catch (moveError) {
      Logger.error(
        `🐂 Could not move job ${job.id} to failed - ${moveError}`
      );
    }
  } finally {
    clearInterval(extendLockInterval);
  }
};
// Flipped to true on SIGINT; the worker loops check it before fetching a job.
let isShuttingDown = false;

/** SIGINT listener: announces the shutdown and raises the flag. */
function handleSigint(): void {
  console.log("Received SIGINT. Shutting down gracefully...");
  isShuttingDown = true;
}

process.on("SIGINT", handleSigint);
2024-07-30 14:44:13 -04:00
/**
 * Polling loop that manually pulls jobs from `queueName` and hands each one to
 * `processJobInternal`.
 *
 * The worker is created without a processor, so jobs are fetched explicitly
 * with `getNextJob`. Each fetched job is started without being awaited, so
 * several jobs can be in flight at once; the loop only pauses briefly between
 * fetches. The loop ends once `isShuttingDown` has been set.
 *
 * @param queueName          Name of the queue to consume.
 * @param processJobInternal Handler invoked with the lock token and the job.
 */
const workerFun = async (queueName: string, processJobInternal: (token: string, job: Job) => Promise<void>) => {
  const worker = new Worker(queueName, null, {
    connection: redisConnection,
    // Use the env-configurable values (defaults: 1 minute / 30 seconds)
    // instead of repeating the same numbers as literals here.
    lockDuration: workerLockDuration,
    // lockRenewTime: 15 * 1000, // 15 seconds
    stalledInterval: workerStalledCheckInterval,
    maxStalledCount: 10, // 10 times
  });

  worker.startStalledCheckTimer();

  const monitor = await systemMonitor;

  while (!isShuttingDown) {
    const canAcceptConnection = await monitor.acceptConnection();
    if (!canAcceptConnection) {
      console.log("Cant accept connection");
      await sleep(cantAcceptConnectionInterval); // more sleep
      continue;
    }

    // A token is only needed when a job is actually requested.
    const token = uuidv4();
    const job = await worker.getNextJob(token);
    if (job) {
      // Intentionally not awaited so the loop can keep fetching jobs; the
      // catch keeps a misbehaving handler from crashing the process.
      void processJobInternal(token, job).catch((error: unknown) => {
        Logger.error(
          `🐂 Unhandled error while processing job ${job.id} - ${error}`
        );
      });
      await sleep(gotJobInterval);
    } else {
      await sleep(connectionMonitorInterval);
    }
  }

  console.log("No longer accepting new jobs. SIGINT");
};
2024-07-30 14:44:13 -04:00
// Start one polling loop per queue (web-scraper queue first); both loops use
// the same job handler and are started without being awaited.
for (const queueName of [webScraperQueueName, scrapeQueueName]) {
  workerFun(queueName, processJobInternal);
}
2024-07-30 13:27:23 -04:00
/**
 * Extracts a message from an arbitrary thrown value.
 *
 * @returns The string itself for thrown strings, the `message` of an `Error`
 *          (or of any object carrying a string `message`), otherwise
 *          `undefined`.
 */
function describeJobError(error: unknown): string | undefined {
  if (typeof error === "string") {
    return error;
  }
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === "object" && error !== null && "message" in error) {
    const { message } = error as { message?: unknown };
    return typeof message === "string" ? message : undefined;
  }
  return undefined;
}

/**
 * Executes the scraping pipeline for one job, then notifies the webhook (crawl
 * jobs only) and writes a job log entry.
 *
 * If a `cancelled:<job id>` key exists once the pipeline has finished, the job
 * is discarded and moved to failed, and all results are reported as empty.
 *
 * @param job   The queue job to run.
 * @param token Lock token the job was fetched with.
 * @returns The result payload (`success`, `docs`, `project_id`, `error`, and on
 *          the success path `result.links`). Returns `undefined` when an error
 *          occurred while the web-scraper queue is paused.
 */
async function processJob(job: Job, token: string) {
  Logger.info(`🐂 Worker taking job ${job.id}`);

  try {
    // Awaited so a failed progress update is handled by the catch below
    // instead of becoming an unhandled rejection.
    await job.updateProgress({
      current: 1,
      total: 100,
      current_step: "SCRAPING",
      current_url: "",
    });

    const start = Date.now();
    const { success, message, docs } = await startWebScraperPipeline({
      job,
      token,
    });
    const end = Date.now();
    const timeTakenInSeconds = (end - start) / 1000;

    const redis = await getWebScraperQueue().client;
    const isCancelled = Boolean(await redis.exists("cancelled:" + job.id));

    if (isCancelled) {
      await job.discard();
      // Use the token this worker holds the lock with.
      await job.moveToFailed(new Error("Job cancelled by user"), token);
    }

    const data = {
      success,
      result: {
        links: isCancelled
          ? []
          : docs.map((doc) => {
              return {
                content: doc,
                source: doc?.metadata?.sourceURL ?? doc?.url ?? "",
              };
            }),
      },
      project_id: job.data.project_id,
      error: isCancelled ? "Job cancelled by user" : message /* etc... */,
      docs: isCancelled ? [] : docs,
    };

    // Only crawl jobs that were not cancelled notify the webhook.
    if (job.data.mode === "crawl" && !isCancelled) {
      await callWebhook(job.data.team_id, job.id as string, data);
    }

    await logJob({
      job_id: job.id as string,
      success: success && !isCancelled,
      message: isCancelled ? "Job cancelled by user" : message,
      num_docs: isCancelled ? 0 : docs.length,
      docs: isCancelled ? [] : docs,
      time_taken: timeTakenInSeconds,
      team_id: job.data.team_id,
      mode: job.data.mode,
      url: job.data.url,
      crawlerOptions: job.data.crawlerOptions,
      pageOptions: job.data.pageOptions,
      origin: job.data.origin,
    });

    Logger.info(`🐂 Job done ${job.id}`);
    return data;
  } catch (error) {
    Logger.error(`🐂 Job errored ${job.id} - ${error}`);

    if (await getWebScraperQueue().isPaused()) {
      Logger.debug("🐂Queue is paused, ignoring");
      return;
    }

    const errorMessage = describeJobError(error);

    if (error instanceof CustomError) {
      // Here we handle the error, then save the failed job
      Logger.error(error.message); // or any other error handling

      logtail.error("Custom error while ingesting", {
        job_id: job.id,
        error: error.message,
        dataIngestionJob: error.dataIngestionJob,
      });
    }
    Logger.error(error);

    logtail.error("Overall error ingesting", {
      job_id: job.id,
      error: errorMessage,
    });

    const data = {
      success: false,
      docs: [],
      project_id: job.data.project_id,
      error:
        "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
    };

    if (job.data.mode === "crawl") {
      await callWebhook(job.data.team_id, job.id as string, data);
    }

    await logJob({
      job_id: job.id as string,
      success: false,
      message:
        errorMessage ?? "Something went wrong... Contact help@mendable.ai",
      num_docs: 0,
      docs: [],
      time_taken: 0,
      team_id: job.data.team_id,
      // Log the job's real mode, matching the success path, rather than
      // recording every failure as a crawl.
      mode: job.data.mode,
      url: job.data.url,
      crawlerOptions: job.data.crawlerOptions,
      pageOptions: job.data.pageOptions,
      origin: job.data.origin,
    });
    // done(null, data);
    return data;
  }
}
2024-07-30 13:27:23 -04:00
// wsq.process(
// Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
// processJob
// );
// wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting"));
// wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active"));
// wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed"));
// wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
// wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));