From d347160ff9bac2b63e6c72db1b828a326634d3b0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 30 Aug 2024 17:32:41 -0300 Subject: [PATCH] Nick: --- apps/api/src/controllers/v0/scrape.ts | 15 ++++++++++++++- apps/api/src/controllers/v1/scrape.ts | 12 ++++++++++++ apps/api/src/services/queue-worker.ts | 18 ------------------ 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index d2614d4d..40df5021 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -254,13 +254,26 @@ export async function scrapeController(req: Request, res: Response) { } } } + + let doc = result.data; + if (!pageOptions || !pageOptions.includeRawHtml) { + if (doc && doc.rawHtml) { + delete doc.rawHtml; + } + } + + if(pageOptions && pageOptions.includeExtract) { + if(!pageOptions.includeMarkdown && doc && doc.markdown) { + delete doc.markdown; + } + } logJob({ job_id: jobId, success: result.success, message: result.error, num_docs: 1, - docs: [result.data], + docs: [doc], time_taken: timeTakenInSeconds, team_id: team_id, mode: "scrape", diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 20ec250a..c573e100 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -115,6 +115,18 @@ export async function scrapeController( }); } + if (!pageOptions || !pageOptions.includeRawHtml) { + if (doc && doc.rawHtml) { + delete doc.rawHtml; + } + } + + if(pageOptions && pageOptions.includeExtract) { + if(!pageOptions.includeMarkdown && doc && doc.markdown) { + delete doc.markdown; + } + } + logJob({ job_id: jobId, success: true, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index dc3f03f7..af2ec851 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -202,24 +202,6 @@ async function processJob(job: Job, token: string) { const rawHtml = docs[0] ? docs[0].rawHtml : ""; - if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) { - if (docs[0] && docs[0].rawHtml) { - delete docs[0].rawHtml; - } - } - - if(job.data.pageOptions && job.data.pageOptions.includeExtract ) { - if(!job.data.pageOptions.includeMarkdown) { - delete docs[0].markdown; - } - // if(!job.data.pageOptions.includeRawHtml) { - // delete docs[0].rawHtml; - // } - // if(!job.data.pageOptions.includeHtml) { - // delete docs[0].html; - // } - } - const data = { success, result: {