From 1a1ac9fd60baa54e95d160f789d111a83b4e248e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 3 Oct 2024 16:37:58 -0300 Subject: [PATCH 1/5] Nick: --- apps/api/src/controllers/v0/crawl-status.ts | 10 +++++--- .../api/src/controllers/v1/crawl-status-ws.ts | 8 +++++-- apps/api/src/controllers/v1/crawl-status.ts | 8 +++++-- apps/api/src/main/runWebScraper.ts | 1 + apps/api/src/services/queue-worker.ts | 24 +++++++++---------- 5 files changed, 32 insertions(+), 19 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 1b1ffdc5..f809c7be 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -50,11 +50,15 @@ export async function crawlStatusController(req: Request, res: Response) { return res.status(403).json({ error: "Forbidden" }); } - const jobIDs = await getCrawlJobs(req.params.jobId); + let jobIDs = await getCrawlJobs(req.params.jobId); const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); - const jobStatuses = await Promise.all(jobs.map(x => x.getState())); - const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobs.some((x, i) => jobStatuses[i] === "failed" && x.failedReason !== "Concurrency limit hit") ? "failed" : "active"; + let jobStatuses = await Promise.all(jobs.map(x => x.getState())); + // filter out failed jobs + jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed")); + // filter the job statues + jobStatuses = jobStatuses.filter(x => x[1] !== "failed"); + const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active"; const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 9832a948..b67e559b 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -94,11 +94,15 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth [x, await getScrapeQueue().getJobState(x)] as const)); const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id)); jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed - const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : jobStatuses.some(x => x[1] === "failed") ? "failed" : "scraping"; + // filter out failed jobs + jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed")); + // filter the job statues + jobStatuses = jobStatuses.filter(x => x[1] !== "failed"); + const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; const doneJobs = await getJobs(doneJobIDs); const data = doneJobs.map(x => x.returnvalue); diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 9c0026a0..63331c9c 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -57,11 +57,15 @@ export async function crawlStatusController(req: RequestWithAuth [x, await getScrapeQueue().getJobState(x)] as const)); const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id)); jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed - const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : jobStatuses.some(x => x[1] === "failed") ? "failed" : "scraping"; + // filter out failed jobs + jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed")); + // filter the job statues + jobStatuses = jobStatuses.filter(x => x[1] !== "failed"); + const status: Exclude["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1); diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 571122f9..6e642c65 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -59,6 +59,7 @@ export async function startWebScraperPipeline({ is_scrape: job.data.is_scrape ?? false, })) as { success: boolean; message: string; docs: Document[] }; } + export async function runWebScraper({ url, mode, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 532e8fee..1ccf486e 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -395,6 +395,7 @@ async function processJob(job: Job, token: string) { pageOptions: sc.pageOptions, origin: job.data.origin, crawl_id: job.data.crawl_id, + webhook: job.data.webhook, v1: job.data.v1, }, {}, @@ -468,9 +469,8 @@ async function processJob(job: Job, token: string) { } } else { const jobIDs = await getCrawlJobs(job.data.crawl_id); - const jobStatuses = await Promise.all(jobIDs.map((x) => getScrapeQueue().getJobState(x))); const jobStatus = - sc.cancelled || jobStatuses.some((x) => x === "failed") + sc.cancelled ? "failed" : "completed"; @@ -554,16 +554,16 @@ async function processJob(job: Job, token: string) { job.data.v1 ); } - if (job.data.v1) { - callWebhook( - job.data.team_id, - job.id as string, - [], - job.data.webhook, - job.data.v1, - "crawl.failed" - ); - } + // if (job.data.v1) { + // callWebhook( + // job.data.team_id, + // job.id as string, + // [], + // job.data.webhook, + // job.data.v1, + // "crawl.failed" + // ); + // } if (job.data.crawl_id) { await logJob({ From 49bd95327e189164801c1e66bab3b48049fd323c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 3 Oct 2024 17:00:33 -0300 Subject: [PATCH 2/5] Update types.ts --- apps/api/src/controllers/v1/types.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 3781eb78..01dff86a 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -461,8 +461,8 @@ export function legacyDocumentConverter(doc: any): Document { ...doc.metadata, pageError: undefined, pageStatusCode: undefined, - error: doc.metadata.pageError, - statusCode: doc.metadata.pageStatusCode, + error: doc.metadata?.pageError, + statusCode: doc.metadata?.pageStatusCode, }, }; } From 82551bb6bc74c1d61b96f91fe8d2c20a157a6deb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 3 Oct 2024 17:13:30 -0300 Subject: [PATCH 3/5] Update index.test.ts --- .../__tests__/e2e_v1_withAuth/index.test.ts | 71 ++++++++++--------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 8aabf748..552a7333 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -278,23 +278,24 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.statusCode).toBe(401); }, 60000); - it.concurrent('should return a successful response for a scrape with 403 page', async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/403' }); - await new Promise((r) => setTimeout(r, 5000)); + // Removed it as we want to retry fallback to the next scraper + // it.concurrent('should return a successful response for a scrape with 403 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/403' }); + // await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(403); - }, 60000); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(403); + // }, 60000); it.concurrent('should return a successful response for a scrape with 404 page', async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -314,23 +315,23 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.statusCode).toBe(404); }, 60000); - it.concurrent('should return a successful response for a scrape with 405 page', async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/405' }); - await new Promise((r) => setTimeout(r, 5000)); + // it.concurrent('should return a successful response for a scrape with 405 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/405' }); + // await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(405); - }, 60000); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(405); + // }, 60000); it.concurrent('should return a successful response for a scrape with 500 page', async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -680,7 +681,7 @@ describe("POST /v1/crawl", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", - limit: 10, + limit: 20, includePaths: ["blog/*"], }); @@ -736,7 +737,7 @@ describe("POST /v1/crawl", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", - limit: 10, + limit: 20, excludePaths: ["blog/*"], }); @@ -932,7 +933,7 @@ describe("GET /v1/crawl/:jobId", () => { expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 10000)); + await new Promise((r) => setTimeout(r, 5000)); const responseCancel = await request(TEST_URL) .delete(`/v1/crawl/${crawlResponse.body.id}`) From ddd774ed6869c2bb0388766ba0fb20febe1dc359 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 3 Oct 2024 17:20:57 -0300 Subject: [PATCH 4/5] Nick: --- .../__tests__/e2e_v1_withAuth/index.test.ts | 40 +++++++++---------- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 552a7333..eef65125 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -333,23 +333,23 @@ describe("E2E Tests for v1 API Routes", () => { // expect(response.body.data.metadata.statusCode).toBe(405); // }, 60000); - it.concurrent('should return a successful response for a scrape with 500 page', async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/500' }); - await new Promise((r) => setTimeout(r, 5000)); + // it.concurrent('should return a successful response for a scrape with 500 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/500' }); + // await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(500); - }, 60000); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(500); + // }, 60000); it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -681,7 +681,7 @@ describe("POST /v1/crawl", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", - limit: 20, + limit: 40, includePaths: ["blog/*"], }); @@ -737,7 +737,7 @@ describe("POST /v1/crawl", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", - limit: 20, + limit: 40, excludePaths: ["blog/*"], }); @@ -929,11 +929,11 @@ describe("GET /v1/crawl/:jobId", () => { .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://docs.tatum.io", limit: 200 }); + .send({ url: "https://docs.firecrawl.dev", limit: 10 }); expect(crawlResponse.statusCode).toBe(200); - await new Promise((r) => setTimeout(r, 5000)); + await new Promise((r) => setTimeout(r, 10000)); const responseCancel = await request(TEST_URL) .delete(`/v1/crawl/${crawlResponse.body.id}`) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 13ca7dd2..767f30e1 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -425,7 +425,7 @@ export async function scrapSingleUrl( Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; } - if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) { + if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400 || pageStatusCode == 401)) { Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`); break; } From c6a29efbed94100b1cfa8c90d8dbb0693f17207b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 3 Oct 2024 17:33:38 -0300 Subject: [PATCH 5/5] Update crawl-status.ts --- apps/api/src/controllers/v0/crawl-status.ts | 25 +++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index f809c7be..4c50b375 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -49,15 +49,26 @@ export async function crawlStatusController(req: Request, res: Response) { if (sc.team_id !== team_id) { return res.status(403).json({ error: "Forbidden" }); } - let jobIDs = await getCrawlJobs(req.params.jobId); - - const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + let jobs = await getJobs(req.params.jobId, jobIDs); let jobStatuses = await Promise.all(jobs.map(x => x.getState())); - // filter out failed jobs - jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed")); - // filter the job statues - jobStatuses = jobStatuses.filter(x => x[1] !== "failed"); + + // Combine jobs and jobStatuses into a single array of objects + let jobsWithStatuses = jobs.map((job, index) => ({ + job, + status: jobStatuses[index] + })); + + // Filter out failed jobs + jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed"); + + // Sort jobs by timestamp + jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp); + + // Extract sorted jobs and statuses + jobs = jobsWithStatuses.map(x => x.job); + jobStatuses = jobsWithStatuses.map(x => x.status); + const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active"; const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);