From b96b97ed721c677620020b91791e1b222db0641f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Tue, 7 Jan 2025 10:09:15 +0100 Subject: [PATCH] fix(crawl): don't push rawhtml to db unless requested --- apps/api/src/controllers/v1/crawl-status.ts | 2 ++ apps/api/src/services/queue-worker.ts | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index ce3831f2..f5fcf6b6 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -196,6 +196,7 @@ export async function crawlStatusController( nextURL.searchParams.set("limit", req.query.limit); } + // deprecated: this is done on queue-worker side now. if you see this after january 8, 2025, remove this if (data.length > 0) { if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) { for (let ii = 0; ii < doneJobs.length; ii++) { @@ -205,6 +206,7 @@ export async function crawlStatusController( } } } + // remove until here res.status(200).json({ success: true, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a48c798b..90c96cf6 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -674,6 +674,10 @@ async function processJob(job: Job & { id: string }, token: string) { const rawHtml = doc.rawHtml ?? ""; + if (!job.data.scrapeOptions.formats.includes("rawHtml")) { + delete doc.rawHtml; + } + const data = { success: true, result: {