From 00335e2ba9a827db9964a9b82c5feaa990633533 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 11 Dec 2024 19:46:11 -0300 Subject: [PATCH 1/3] Nick: fixed prettier --- apps/api/.prettierrc | 3 + apps/api/package.json | 2 +- .../src/__tests__/e2e_extract/index.test.ts | 509 ++-- .../__tests__/e2e_full_withAuth/index.test.ts | 2246 +++++++++-------- apps/api/src/__tests__/e2e_map/index.test.ts | 10 +- .../src/__tests__/e2e_noAuth/index.test.ts | 15 +- .../__tests__/e2e_v1_withAuth/index.test.ts | 1487 +++++------ .../e2e_v1_withAuth_all_params/index.test.ts | 576 +++-- .../src/__tests__/e2e_withAuth/index.test.ts | 51 +- .../src/controllers/__tests__/crawl.test.ts | 38 +- apps/api/src/controllers/auth.ts | 39 +- apps/api/src/controllers/v0/admin/queue.ts | 18 +- .../src/controllers/v0/admin/redis-health.ts | 2 +- apps/api/src/controllers/v0/crawl-cancel.ts | 8 +- apps/api/src/controllers/v0/crawl-status.ts | 66 +- apps/api/src/controllers/v0/crawl.ts | 75 +- apps/api/src/controllers/v0/crawlPreview.ts | 92 +- apps/api/src/controllers/v0/keyAuth.ts | 8 +- apps/api/src/controllers/v0/scrape.ts | 87 +- apps/api/src/controllers/v0/search.ts | 77 +- apps/api/src/controllers/v0/status.ts | 29 +- .../v1/__tests__/urlValidation.test.ts | 32 +- apps/api/src/controllers/v1/batch-scrape.ts | 76 +- .../src/controllers/v1/concurrency-check.ts | 2 +- apps/api/src/controllers/v1/crawl-cancel.ts | 7 +- .../api/src/controllers/v1/crawl-status-ws.ts | 113 +- apps/api/src/controllers/v1/crawl-status.ts | 122 +- apps/api/src/controllers/v1/crawl.ts | 91 +- apps/api/src/controllers/v1/extract.ts | 124 +- apps/api/src/controllers/v1/map.ts | 50 +- apps/api/src/controllers/v1/scrape-status.ts | 12 +- apps/api/src/controllers/v1/scrape.ts | 44 +- apps/api/src/controllers/v1/types.ts | 566 +++-- apps/api/src/index.ts | 125 +- apps/api/src/lib/LLM-extraction/index.ts | 2 +- apps/api/src/lib/LLM-extraction/models.ts | 26 +- .../lib/__tests__/html-to-markdown.test.ts | 46 +- .../src/lib/__tests__/job-priority.test.ts | 6 +- apps/api/src/lib/batch-process.ts | 27 +- apps/api/src/lib/cache.ts | 74 +- apps/api/src/lib/concurrency-limit.ts | 81 +- apps/api/src/lib/crawl-redis.test.ts | 62 +- apps/api/src/lib/crawl-redis.ts | 401 +-- apps/api/src/lib/custom-error.ts | 3 +- apps/api/src/lib/default-values.ts | 6 +- apps/api/src/lib/entities.ts | 86 +- apps/api/src/lib/extract/build-document.ts | 8 +- apps/api/src/lib/extract/reranker.ts | 12 +- apps/api/src/lib/html-to-markdown.ts | 50 +- apps/api/src/lib/job-priority.ts | 2 +- apps/api/src/lib/logger.ts | 58 +- apps/api/src/lib/parseApi.ts | 1 - apps/api/src/lib/ranker.test.ts | 57 +- apps/api/src/lib/ranker.ts | 66 +- apps/api/src/lib/scrape-events.ts | 96 +- apps/api/src/lib/supabase-jobs.ts | 5 +- apps/api/src/lib/timeout.ts | 2 +- apps/api/src/lib/validate-country.ts | 502 ++-- apps/api/src/lib/validateUrl.test.ts | 68 +- apps/api/src/lib/validateUrl.ts | 65 +- apps/api/src/lib/withAuth.ts | 4 +- apps/api/src/main/runWebScraper.ts | 114 +- apps/api/src/routes/admin.ts | 9 +- apps/api/src/routes/v0.ts | 2 +- apps/api/src/routes/v1.ts | 270 +- apps/api/src/run-req.ts | 18 +- .../WebScraper/__tests__/crawler.test.ts | 67 +- .../scraper/WebScraper/__tests__/dns.test.ts | 4 +- apps/api/src/scraper/WebScraper/crawler.ts | 190 +- .../WebScraper/custom/handleCustomScraping.ts | 27 +- apps/api/src/scraper/WebScraper/sitemap.ts | 72 +- .../utils/__tests__/blocklist.test.ts | 138 +- .../utils/__tests__/maxDepthUtils.test.ts | 47 +- .../src/scraper/WebScraper/utils/blocklist.ts | 95 +- .../scraper/WebScraper/utils/maxDepthUtils.ts | 11 +- .../WebScraper/utils/removeBase64Images.ts | 6 +- .../scraper/scrapeURL/engines/cache/index.ts | 22 +- .../scraper/scrapeURL/engines/docx/index.ts | 12 +- .../scraper/scrapeURL/engines/fetch/index.ts | 46 +- .../engines/fire-engine/checkStatus.ts | 181 +- .../scrapeURL/engines/fire-engine/delete.ts | 53 +- .../scrapeURL/engines/fire-engine/index.ts | 426 ++-- .../scrapeURL/engines/fire-engine/scrape.ts | 136 +- .../src/scraper/scrapeURL/engines/index.ts | 583 +++-- .../scraper/scrapeURL/engines/pdf/index.ts | 283 ++- .../scrapeURL/engines/playwright/index.ts | 71 +- .../scrapeURL/engines/scrapingbee/index.ts | 116 +- .../scrapeURL/engines/utils/downloadFile.ts | 79 +- .../engines/utils/specialtyHandler.ts | 36 +- apps/api/src/scraper/scrapeURL/error.ts | 66 +- apps/api/src/scraper/scrapeURL/index.ts | 614 +++-- .../src/scraper/scrapeURL/lib/extractLinks.ts | 61 +- .../scraper/scrapeURL/lib/extractMetadata.ts | 57 +- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 317 ++- .../scrapeURL/lib/removeUnwantedElements.ts | 181 +- .../scrapeURL/lib/urlSpecificParams.ts | 76 +- .../src/scraper/scrapeURL/scrapeURL.test.ts | 847 ++++--- .../scraper/scrapeURL/transformers/cache.ts | 46 +- .../scraper/scrapeURL/transformers/index.ts | 235 +- .../scrapeURL/transformers/llmExtract.ts | 366 +-- .../transformers/removeBase64Images.ts | 13 +- .../transformers/uploadScreenshot.ts | 36 +- apps/api/src/search/fireEngine.ts | 6 +- apps/api/src/search/googlesearch.ts | 238 +- apps/api/src/search/index.ts | 4 +- apps/api/src/search/searchapi.ts | 16 +- apps/api/src/search/serper.ts | 17 +- apps/api/src/services/alerts/index.ts | 2 +- apps/api/src/services/alerts/slack.ts | 6 +- apps/api/src/services/billing/auto_charge.ts | 246 +- .../src/services/billing/credit_billing.ts | 82 +- .../api/src/services/billing/issue_credits.ts | 2 +- apps/api/src/services/billing/stripe.ts | 11 +- apps/api/src/services/idempotency/create.ts | 6 +- apps/api/src/services/idempotency/validate.ts | 24 +- apps/api/src/services/logging/crawl_log.ts | 16 +- apps/api/src/services/logging/log_job.ts | 42 +- apps/api/src/services/logging/scrape_log.ts | 6 +- .../notification/email_notification.ts | 188 +- apps/api/src/services/posthog.ts | 6 +- apps/api/src/services/queue-jobs.ts | 95 +- apps/api/src/services/queue-service.ts | 13 +- apps/api/src/services/queue-worker.ts | 359 ++- apps/api/src/services/rate-limiter.test.ts | 8 +- apps/api/src/services/rate-limiter.ts | 66 +- apps/api/src/services/redis.ts | 7 +- apps/api/src/services/redlock.ts | 2 +- apps/api/src/services/sentry.ts | 6 +- apps/api/src/services/supabase.ts | 4 +- apps/api/src/services/system-monitor.ts | 401 +-- apps/api/src/services/webhook.ts | 43 +- apps/api/src/strings.ts | 2 +- apps/api/src/supabase_types.ts | 30 +- apps/api/src/types.ts | 72 +- 134 files changed, 9565 insertions(+), 7108 deletions(-) create mode 100644 apps/api/.prettierrc diff --git a/apps/api/.prettierrc b/apps/api/.prettierrc new file mode 100644 index 00000000..d93a7f24 --- /dev/null +++ b/apps/api/.prettierrc @@ -0,0 +1,3 @@ +{ + "trailingComma": "none" +} \ No newline at end of file diff --git a/apps/api/package.json b/apps/api/package.json index 56724de7..86f798e9 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -6,7 +6,7 @@ "scripts": { "start": "nodemon --exec ts-node src/index.ts", "start:production": "tsc && node dist/src/index.js", - "format": "prettier --write \"src/**/*.(js|ts)\"", + "format": "npx prettier --write \"src/**/*.(js|ts)\"", "flyio": "node dist/src/index.js", "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc && pnpm sentry:sourcemaps", diff --git a/apps/api/src/__tests__/e2e_extract/index.test.ts b/apps/api/src/__tests__/e2e_extract/index.test.ts index 679dc3cd..117cbab1 100644 --- a/apps/api/src/__tests__/e2e_extract/index.test.ts +++ b/apps/api/src/__tests__/e2e_extract/index.test.ts @@ -3,264 +3,305 @@ import dotenv from "dotenv"; import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, - FirecrawlScrapeResponse, + FirecrawlScrapeResponse } from "../../types"; dotenv.config(); const TEST_URL = "http://127.0.0.1:3002"; describe("E2E Tests for Extract API Routes", () => { - it.concurrent("should return authors of blog posts on firecrawl.dev", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["https://firecrawl.dev/*"], - prompt: "Who are the authors of the blog posts?", - schema: { - type: "object", - properties: { authors: { type: "array", items: { type: "string" } } }, - }, - }); - - console.log(response.body); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("authors"); - - let gotItRight = 0; - for (const author of response.body.data?.authors) { - if (author.includes("Caleb Peffer")) gotItRight++; - if (author.includes("Gergő Móricz")) gotItRight++; - if (author.includes("Eric Ciarla")) gotItRight++; - if (author.includes("Nicolas Camara")) gotItRight++; - if (author.includes("Jon")) gotItRight++; - if (author.includes("Wendong")) gotItRight++; - - } - - expect(gotItRight).toBeGreaterThan(1); - }, 60000); - - it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["firecrawl.dev/*"], - prompt: "Who are the founders of the company?", - allowExternalLinks: true, - schema: { - type: "object", - properties: { founders: { type: "array", items: { type: "string" } } }, - }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("founders"); - - console.log(response.body.data?.founders); - let gotItRight = 0; - for (const founder of response.body.data?.founders) { - if (founder.includes("Caleb")) gotItRight++; - if (founder.includes("Eric")) gotItRight++; - if (founder.includes("Nicolas")) gotItRight++; - if (founder.includes("nick")) gotItRight++; - if (founder.includes("eric")) gotItRight++; - if (founder.includes("jon-noronha")) gotItRight++; - - } - - expect(gotItRight).toBeGreaterThanOrEqual(2); - }, 60000); - - it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["https://firecrawl.dev/*"], - prompt: "What are they hiring for?", - allowExternalLinks: true, - schema: { - type: "array", - items: { - type: "string" - }, - required: ["items"] - }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - console.log(response.body.data); - - let gotItRight = 0; - for (const hiring of response.body.data?.items) { - if (hiring.includes("Developer Support Engineer")) gotItRight++; - if (hiring.includes("Dev Ops Engineer")) gotItRight++; - if (hiring.includes("Founding Web Automation Engineer")) gotItRight++; - } - - expect(gotItRight).toBeGreaterThan(2); - }, 60000); - - it.concurrent("should return PCI DSS compliance for Fivetran", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["fivetran.com/*"], - prompt: "Does Fivetran have PCI DSS compliance?", - allowExternalLinks: true, - schema: { - type: "object", - properties: { - pciDssCompliance: { type: "boolean" } - } - }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data?.pciDssCompliance).toBe(true); - }, 60000); - - it.concurrent("should return Azure Data Connectors for Fivetran", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["fivetran.com/*"], - prompt: "What are the Azure Data Connectors they offer?", - schema: { - type: "array", - items: { + it.concurrent( + "should return authors of blog posts on firecrawl.dev", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["https://firecrawl.dev/*"], + prompt: "Who are the authors of the blog posts?", + schema: { type: "object", properties: { - connector: { type: "string" }, - description: { type: "string" }, - supportsCaptureDelete: { type: "boolean" } + authors: { type: "array", items: { type: "string" } } } } - } - }) + }); - console.log(response.body); - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty("data"); - // expect(response.body.data?.pciDssCompliance).toBe(true); - }, 60000); + console.log(response.body); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("authors"); - it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"], - prompt: "what applicant tracking system is this company using?", - schema: { - type: "object", - properties: { - isGreenhouseATS: { type: "boolean" }, - answer: { type: "string" } - } - }, - allowExternalLinks: true - }) + let gotItRight = 0; + for (const author of response.body.data?.authors) { + if (author.includes("Caleb Peffer")) gotItRight++; + if (author.includes("Gergő Móricz")) gotItRight++; + if (author.includes("Eric Ciarla")) gotItRight++; + if (author.includes("Nicolas Camara")) gotItRight++; + if (author.includes("Jon")) gotItRight++; + if (author.includes("Wendong")) gotItRight++; + } - console.log(response.body); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data?.isGreenhouseATS).toBe(true); - }, 60000); + expect(gotItRight).toBeGreaterThan(1); + }, + 60000 + ); - it.concurrent("should return mintlify api components", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["https://mintlify.com/docs/*"], - prompt: "what are the 4 API components?", - schema: { - type: "array", - items: { + it.concurrent( + "should return founders of firecrawl.dev (allowExternalLinks = true)", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["firecrawl.dev/*"], + prompt: "Who are the founders of the company?", + allowExternalLinks: true, + schema: { type: "object", properties: { - component: { type: "string" } + founders: { type: "array", items: { type: "string" } } + } + } + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("founders"); + + console.log(response.body.data?.founders); + let gotItRight = 0; + for (const founder of response.body.data?.founders) { + if (founder.includes("Caleb")) gotItRight++; + if (founder.includes("Eric")) gotItRight++; + if (founder.includes("Nicolas")) gotItRight++; + if (founder.includes("nick")) gotItRight++; + if (founder.includes("eric")) gotItRight++; + if (founder.includes("jon-noronha")) gotItRight++; + } + + expect(gotItRight).toBeGreaterThanOrEqual(2); + }, + 60000 + ); + + it.concurrent( + "should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["https://firecrawl.dev/*"], + prompt: "What are they hiring for?", + allowExternalLinks: true, + schema: { + type: "array", + items: { + type: "string" + }, + required: ["items"] + } + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + console.log(response.body.data); + + let gotItRight = 0; + for (const hiring of response.body.data?.items) { + if (hiring.includes("Developer Support Engineer")) gotItRight++; + if (hiring.includes("Dev Ops Engineer")) gotItRight++; + if (hiring.includes("Founding Web Automation Engineer")) gotItRight++; + } + + expect(gotItRight).toBeGreaterThan(2); + }, + 60000 + ); + + it.concurrent( + "should return PCI DSS compliance for Fivetran", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["fivetran.com/*"], + prompt: "Does Fivetran have PCI DSS compliance?", + allowExternalLinks: true, + schema: { + type: "object", + properties: { + pciDssCompliance: { type: "boolean" } + } + } + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data?.pciDssCompliance).toBe(true); + }, + 60000 + ); + + it.concurrent( + "should return Azure Data Connectors for Fivetran", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["fivetran.com/*"], + prompt: "What are the Azure Data Connectors they offer?", + schema: { + type: "array", + items: { + type: "object", + properties: { + connector: { type: "string" }, + description: { type: "string" }, + supportsCaptureDelete: { type: "boolean" } + } + } + } + }); + + console.log(response.body); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty("data"); + // expect(response.body.data?.pciDssCompliance).toBe(true); + }, + 60000 + ); + + it.concurrent( + "should return Greenhouse Applicant Tracking System for Abnormal Security", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: [ + "https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003" + ], + prompt: "what applicant tracking system is this company using?", + schema: { + type: "object", + properties: { + isGreenhouseATS: { type: "boolean" }, + answer: { type: "string" } } }, - required: ["items"] - }, - allowExternalLinks: true - }) + allowExternalLinks: true + }); - console.log(response.body.data?.items); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data?.items.length).toBe(4); - let gotItRight = 0; - for (const component of response.body.data?.items) { - if (component.component.toLowerCase().includes("parameter")) gotItRight++; - if (component.component.toLowerCase().includes("response")) gotItRight++; - if (component.component.toLowerCase().includes("expandable")) gotItRight++; - if (component.component.toLowerCase().includes("sticky")) gotItRight++; - if (component.component.toLowerCase().includes("examples")) gotItRight++; + console.log(response.body); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data?.isGreenhouseATS).toBe(true); + }, + 60000 + ); - } - expect(gotItRight).toBeGreaterThan(2); - }, 60000); - - it.concurrent("should return information about Eric Ciarla", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["https://ericciarla.com/"], - prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?", - schema: { - type: "object", - properties: { - name: { type: "string" }, - work: { type: "string" }, - education: { type: "string" } + it.concurrent( + "should return mintlify api components", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["https://mintlify.com/docs/*"], + prompt: "what are the 4 API components?", + schema: { + type: "array", + items: { + type: "object", + properties: { + component: { type: "string" } + } + }, + required: ["items"] }, - required: ["name", "work", "education"] - }, - allowExternalLinks: true - }) + allowExternalLinks: true + }); - console.log(response.body.data); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data?.name).toBe("Eric Ciarla"); - expect(response.body.data?.work).toBeDefined(); - expect(response.body.data?.education).toBeDefined(); - }, 60000); + console.log(response.body.data?.items); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data?.items.length).toBe(4); + let gotItRight = 0; + for (const component of response.body.data?.items) { + if (component.component.toLowerCase().includes("parameter")) + gotItRight++; + if (component.component.toLowerCase().includes("response")) + gotItRight++; + if (component.component.toLowerCase().includes("expandable")) + gotItRight++; + if (component.component.toLowerCase().includes("sticky")) gotItRight++; + if (component.component.toLowerCase().includes("examples")) + gotItRight++; + } + expect(gotItRight).toBeGreaterThan(2); + }, + 60000 + ); - it.concurrent("should extract information without a schema", async () => { - const response = await request(TEST_URL) - .post("/v1/extract") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - urls: ["https://docs.firecrawl.dev"], - prompt: "What is the title and description of the page?" - }); + it.concurrent( + "should return information about Eric Ciarla", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["https://ericciarla.com/"], + prompt: + "Who is Eric Ciarla? Where does he work? Where did he go to school?", + schema: { + type: "object", + properties: { + name: { type: "string" }, + work: { type: "string" }, + education: { type: "string" } + }, + required: ["name", "work", "education"] + }, + allowExternalLinks: true + }); - console.log(response.body.data); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(typeof response.body.data).toBe("object"); - expect(Object.keys(response.body.data).length).toBeGreaterThan(0); - }, 60000); + console.log(response.body.data); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data?.name).toBe("Eric Ciarla"); + expect(response.body.data?.work).toBeDefined(); + expect(response.body.data?.education).toBeDefined(); + }, + 60000 + ); - + it.concurrent( + "should extract information without a schema", + async () => { + const response = await request(TEST_URL) + .post("/v1/extract") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + urls: ["https://docs.firecrawl.dev"], + prompt: "What is the title and description of the page?" + }); + console.log(response.body.data); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(typeof response.body.data).toBe("object"); + expect(Object.keys(response.body.data).length).toBeGreaterThan(0); + }, + 60000 + ); }); diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index dec77131..a8841aab 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -38,14 +38,17 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); it.concurrent("should return an error for a blocklisted URL", async () => { const blocklistedUrl = "https://facebook.com/fake-test"; @@ -70,172 +73,232 @@ describe("E2E Tests for API Routes", () => { // expect(response.statusCode).toBe(200); // }, 30000); // 30 seconds timeout - it.concurrent("should return a successful response with a valid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://roastmywebsite.ai" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.metadata).toHaveProperty("title"); - expect(response.body.data.metadata).toHaveProperty("description"); - expect(response.body.data.metadata).toHaveProperty("keywords"); - expect(response.body.data.metadata).toHaveProperty("robots"); - expect(response.body.data.metadata).toHaveProperty("ogTitle"); - expect(response.body.data.metadata).toHaveProperty("ogDescription"); - expect(response.body.data.metadata).toHaveProperty("ogUrl"); - expect(response.body.data.metadata).toHaveProperty("ogImage"); - expect(response.body.data.metadata).toHaveProperty("ogLocaleAlternate"); - expect(response.body.data.metadata).toHaveProperty("ogSiteName"); - expect(response.body.data.metadata).toHaveProperty("sourceURL"); - expect(response.body.data.metadata).toHaveProperty("pageStatusCode"); - expect(response.body.data.metadata.pageError).toBeUndefined(); - expect(response.body.data.metadata.title).toBe("Roast My Website"); - expect(response.body.data.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.body.data.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl"); - expect(response.body.data.metadata.robots).toBe("follow, index"); - expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); - expect(response.body.data.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"); - expect(response.body.data.metadata.ogUrl).toBe("https://www.roastmywebsite.ai"); - expect(response.body.data.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png"); - expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); - expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); - expect(response.body.data.metadata.sourceURL).toBe("https://roastmywebsite.ai"); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - }, 30000); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://roastmywebsite.ai" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.metadata).toHaveProperty("title"); + expect(response.body.data.metadata).toHaveProperty("description"); + expect(response.body.data.metadata).toHaveProperty("keywords"); + expect(response.body.data.metadata).toHaveProperty("robots"); + expect(response.body.data.metadata).toHaveProperty("ogTitle"); + expect(response.body.data.metadata).toHaveProperty("ogDescription"); + expect(response.body.data.metadata).toHaveProperty("ogUrl"); + expect(response.body.data.metadata).toHaveProperty("ogImage"); + expect(response.body.data.metadata).toHaveProperty("ogLocaleAlternate"); + expect(response.body.data.metadata).toHaveProperty("ogSiteName"); + expect(response.body.data.metadata).toHaveProperty("sourceURL"); + expect(response.body.data.metadata).toHaveProperty("pageStatusCode"); + expect(response.body.data.metadata.pageError).toBeUndefined(); + expect(response.body.data.metadata.title).toBe("Roast My Website"); + expect(response.body.data.metadata.description).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(response.body.data.metadata.keywords).toBe( + "Roast My Website,Roast,Website,GitHub,Firecrawl" + ); + expect(response.body.data.metadata.robots).toBe("follow, index"); + expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); + expect(response.body.data.metadata.ogDescription).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(response.body.data.metadata.ogUrl).toBe( + "https://www.roastmywebsite.ai" + ); + expect(response.body.data.metadata.ogImage).toBe( + "https://www.roastmywebsite.ai/og.png" + ); + expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); + expect(response.body.data.metadata.sourceURL).toBe( + "https://roastmywebsite.ai" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + }, + 30000 + ); // 30 seconds timeout - it.concurrent("should return a successful response with a valid API key and includeHtml set to true", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("html"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.markdown).toContain("_Roast_"); - expect(response.body.data.html).toContain(" { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true } + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.html).toContain(" { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeRawHtml: true }, - }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.content).toContain("_Roast_"); - expect(response.body.data.markdown).toContain("_Roast_"); - expect(response.body.data.rawHtml).toContain(" { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf' }); - await new Promise((r) => setTimeout(r, 6000)); + it.concurrent( + "should return a successful response with a valid API key and includeRawHtml set to true", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeRawHtml: true } + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain("_Roast_"); + expect(response.body.data.markdown).toContain("_Roast_"); + expect(response.body.data.rawHtml).toContain(" { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001' }); - await new Promise((r) => setTimeout(r, 6000)); + it.concurrent( + "should return a successful response for a valid scrape with PDF file", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" }); + await new Promise((r) => setTimeout(r, 6000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy'); - expect(response.body.data.metadata.pageStatusCode).toBe(200); - expect(response.body.data.metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, + 60000 + ); // 60 seconds - it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } }); - await new Promise((r) => setTimeout(r, 6000)); + it.concurrent( + "should return a successful response for a valid scrape with PDF file without explicit .pdf extension", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001" }); + await new Promise((r) => setTimeout(r, 6000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "We present spectrophotometric observations of the Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.pageStatusCode).toBe(200); + expect(response.body.data.metadata.pageError).toBeUndefined(); + }, + 60000 + ); // 60 seconds - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { - const responseWithoutRemoveTags = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com/" }); - expect(responseWithoutRemoveTags.statusCode).toBe(200); - expect(responseWithoutRemoveTags.body).toHaveProperty("data"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); - expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); - expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); - expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site"); - expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer - expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav - expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong + it.concurrent( + "should return a successful response for a valid scrape with PDF file and parsePDF set to false", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://arxiv.org/pdf/astro-ph/9301001.pdf", + pageOptions: { parsePDF: false } + }); + await new Promise((r) => setTimeout(r, 6000)); - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - expect(response.body.data).toHaveProperty("content"); - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data.content).toContain("Scrape This Site"); - expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer - expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav - expect(response.body.data.content).not.toContain("web scraping"); // strong - }, 30000); // 30 seconds timeout + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.content).toContain( + "/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj" + ); + }, + 60000 + ); // 60 seconds + + it.concurrent( + "should return a successful response with a valid API key with removeTags option", + async () => { + const responseWithoutRemoveTags = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com/" }); + expect(responseWithoutRemoveTags.statusCode).toBe(200); + expect(responseWithoutRemoveTags.body).toHaveProperty("data"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("content"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); + expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); + expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); + expect(responseWithoutRemoveTags.body.data.content).toContain( + "Scrape This Site" + ); + expect(responseWithoutRemoveTags.body.data.content).toContain( + "Lessons and Videos" + ); // #footer + expect(responseWithoutRemoveTags.body.data.content).toContain( + "[Sandbox](" + ); // .nav + expect(responseWithoutRemoveTags.body.data.content).toContain( + "web scraping" + ); // strong + + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com/", + pageOptions: { removeTags: [".nav", "#footer", "strong"] } + }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.content).toContain("Scrape This Site"); + expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer + expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav + expect(response.body.data.content).not.toContain("web scraping"); // strong + }, + 30000 + ); // 30 seconds timeout // TODO: add this test back once we nail the waitFor option to be more deterministic // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => { @@ -258,101 +321,137 @@ describe("E2E Tests for API Routes", () => { // expect(duration).toBeGreaterThanOrEqual(7000); // }, 12000); // 12 seconds timeout - it.concurrent('should return a successful response for a scrape with 400 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/400' }); - await new Promise((r) => setTimeout(r, 5000)); + it.concurrent( + "should return a successful response for a scrape with 400 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/400" }); + await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(400); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("bad request"); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(400); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "bad request" + ); + }, + 60000 + ); // 60 seconds - it.concurrent('should return a successful response for a scrape with 401 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/401' }); - await new Promise((r) => setTimeout(r, 5000)); + it.concurrent( + "should return a successful response for a scrape with 401 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/401" }); + await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(401); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("unauthorized"); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(401); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "unauthorized" + ); + }, + 60000 + ); // 60 seconds - it.concurrent("should return a successful response for a scrape with 403 page", async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/403' }); + it.concurrent( + "should return a successful response for a scrape with 403 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/403" }); - await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(403); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("forbidden"); - }, 60000); // 60 seconds + await new Promise((r) => setTimeout(r, 5000)); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(403); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "forbidden" + ); + }, + 60000 + ); // 60 seconds - it.concurrent('should return a successful response for a scrape with 404 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/404' }); - await new Promise((r) => setTimeout(r, 5000)); + it.concurrent( + "should return a successful response for a scrape with 404 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/404" }); + await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(404); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("not found"); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(404); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "not found" + ); + }, + 60000 + ); // 60 seconds - it.concurrent('should return a successful response for a scrape with 405 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/405' }); - await new Promise((r) => setTimeout(r, 5000)); + it.concurrent( + "should return a successful response for a scrape with 405 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/405" }); + await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(405); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("method not allowed"); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(405); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "method not allowed" + ); + }, + 60000 + ); // 60 seconds - it.concurrent('should return a successful response for a scrape with 500 page', async () => { - const response = await request(TEST_URL) - .post('/v0/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/500' }); - await new Promise((r) => setTimeout(r, 5000)); + it.concurrent( + "should return a successful response for a scrape with 500 page", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/500" }); + await new Promise((r) => setTimeout(r, 5000)); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - expect(response.body.data).toHaveProperty('content'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.pageStatusCode).toBe(500); - expect(response.body.data.metadata.pageError.toLowerCase()).toContain("internal server error"); - }, 60000); // 60 seconds + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("content"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.pageStatusCode).toBe(500); + expect(response.body.data.metadata.pageError.toLowerCase()).toContain( + "internal server error" + ); + }, + 60000 + ); // 60 seconds }); describe("POST /v0/crawl", () => { @@ -361,14 +460,17 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); it.concurrent("should return an error for a blocklisted URL", async () => { const blocklistedUrl = "https://twitter.com/fake-test"; @@ -383,56 +485,64 @@ describe("E2E Tests for API Routes", () => { ); }); - it.concurrent("should return a successful response with a valid API key for crawl", async () => { - const response = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("jobId"); - expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ - ); - }); - it.concurrent('should prevent duplicate requests using the same idempotency key', async () => { - const uniqueIdempotencyKey = uuidv4(); - - // First request with the idempotency key - const firstResponse = await request(TEST_URL) - .post('/v0/crawl') - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://docs.firecrawl.dev' }); - - expect(firstResponse.statusCode).toBe(200); - - // Second request with the same idempotency key - const secondResponse = await request(TEST_URL) - .post('/v0/crawl') - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .set("x-idempotency-key", uniqueIdempotencyKey) - .send({ url: 'https://docs.firecrawl.dev' }); - - expect(secondResponse.statusCode).toBe(409); - expect(secondResponse.body.error).toBe('Idempotency key already used'); - }); + it.concurrent( + "should return a successful response with a valid API key for crawl", + async () => { + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("jobId"); + expect(response.body.jobId).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + } + ); + it.concurrent( + "should prevent duplicate requests using the same idempotency key", + async () => { + const uniqueIdempotencyKey = uuidv4(); + + // First request with the idempotency key + const firstResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .set("x-idempotency-key", uniqueIdempotencyKey) + .send({ url: "https://docs.firecrawl.dev" }); + + expect(firstResponse.statusCode).toBe(200); + + // Second request with the same idempotency key + const secondResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .set("x-idempotency-key", uniqueIdempotencyKey) + .send({ url: "https://docs.firecrawl.dev" }); + + expect(secondResponse.statusCode).toBe(409); + expect(secondResponse.body.error).toBe("Idempotency key already used"); + } + ); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + includes: ["blog/*"] + } + }); - it.concurrent("should return a successful response with a valid API key and valid includes option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - limit: 10, - crawlerOptions: { - includes: ["blog/*"], - }, - }); - let response; let isFinished = false; @@ -453,278 +563,322 @@ describe("E2E Tests for API Routes", () => { const completedResponse = response; const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); - }); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds - - it.concurrent("should return a successful response with a valid API key and valid excludes option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - limit: 10, - crawlerOptions: { - excludes: ["blog/*"], - }, + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.mendable.ai/blog/")).toBeTruthy(); }); - - let isFinished = false; - let response; - while (!isFinished) { - response = await request(TEST_URL) + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + }, + 60000 + ); // 60 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + limit: 10, + crawlerOptions: { + excludes: ["blog/*"] + } + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); + }); + }, + 90000 + ); // 90 seconds + + it.concurrent( + "should return a successful response with a valid API key and limit to 3", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + crawlerOptions: { limit: 3 } + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + const completedResponse = response; + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data.length).toBe(3); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + }, + 60000 + ); // 60 seconds + + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + crawlerOptions: { maxDepth: 1 } + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - - const completedResponse = response; - - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); - }); - }, 90000); // 90 seconds - - it.concurrent("should return a successful response with a valid API key and limit to 3", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - crawlerOptions: { limit: 3 }, - }); - - let isFinished = false; - let response; - - while (!isFinished) { - response = await request(TEST_URL) + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); + + it.concurrent( + "should return a successful response with relative max depth option for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com/pages/", + crawlerOptions: { maxDepth: 1 } + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - - const completedResponse = response; - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data.length).toBe(3); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds - - it.concurrent("should return a successful response with max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); - // Check if all URLs have a maximum depth of 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com/pages/", - crawlerOptions: { maxDepth: 1 }, + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(3); }); - expect(crawlResponse.statusCode).toBe(200); + }, + 180000 + ); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + it.concurrent( + "should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.mendable.ai", + crawlerOptions: { maxDepth: 0 } + }); + expect(crawlResponse.statusCode).toBe(200); + + const response = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); - expect(depth).toBeLessThanOrEqual(3); - }); - }, 180000); - - it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { - - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://www.mendable.ai", - crawlerOptions: { maxDepth: 0 }, - }); - expect(crawlResponse.statusCode).toBe(200); - - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); const testurls = completedResponse.body.data.map( (item: any) => item.metadata?.sourceURL ); //console.log(testurls) - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThanOrEqual(1); + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThanOrEqual(1); - // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split('/'); - const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); - expect(depth).toBeLessThanOrEqual(1); - }); - }, 180000); - - - - + // Check if all URLs have an absolute maximum depth of 3 after the base URL depth was 2 and the maxDepth was 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(1); + }); + }, + 180000 + ); // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { // const crawlResponse = await request(TEST_URL) @@ -735,7 +889,7 @@ describe("E2E Tests for API Routes", () => { // url: "https://mendable.ai", // crawlerOptions: { limit: 10 }, // }); - + // const response = await request(TEST_URL) // .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -771,100 +925,126 @@ describe("E2E Tests for API Routes", () => { // expect(completedResponse.body.data[0].content).not.toContain("main menu"); // }, 60000); // 60 seconds - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, - }); - expect(crawlResponse.statusCode).toBe(200); + it.concurrent( + "should return a successful response for a valid crawl job with includeHtml set to true option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true } + }); + expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) + const response = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); + + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - } - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - // 120 seconds - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); - expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); - expect(completedResponse.body.data[0].html).toContain(" { - const crawlInitResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - crawlerOptions: { - allowExternalContentLinks: true, - ignoreSitemap: true, - returnOnlyUrls: true, - limit: 50 - } - }); - - expect(crawlInitResponse.statusCode).toBe(200); - expect(crawlInitResponse.body).toHaveProperty("jobId"); - - let crawlStatus: string = "scraping"; - let crawlData = []; - while (crawlStatus !== "completed") { - const statusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`) + const completedResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - crawlStatus = statusResponse.body.status; - if (statusResponse.body.data) { - crawlData = statusResponse.body.data; + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + + // 120 seconds + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("_Roast_"); + expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); + expect(completedResponse.body.data[0].html).toContain(" { + const crawlInitResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + crawlerOptions: { + allowExternalContentLinks: true, + ignoreSitemap: true, + returnOnlyUrls: true, + limit: 50 + } + }); + + expect(crawlInitResponse.statusCode).toBe(200); + expect(crawlInitResponse.body).toHaveProperty("jobId"); + + let crawlStatus: string = "scraping"; + let crawlData = []; + while (crawlStatus !== "completed") { + const statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + crawlStatus = statusResponse.body.status; + if (statusResponse.body.data) { + crawlData = statusResponse.body.data; + } + if (crawlStatus !== "completed") { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } - if (crawlStatus !== "completed") { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - expect(crawlData.length).toBeGreaterThan(0); - expect(crawlData).toEqual(expect.arrayContaining([ - expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }), - expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }), - expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") }) - ])); - }, 180000); // 3 minutes timeout + expect(crawlData.length).toBeGreaterThan(0); + expect(crawlData).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + url: expect.stringContaining( + "https://firecrawl.dev/?ref=mendable+banner" + ) + }), + expect.objectContaining({ + url: expect.stringContaining("https://mendable.ai/pricing") + }), + expect.objectContaining({ + url: expect.stringContaining("https://x.com/CalebPeffer") + }) + ]) + ); + }, + 180000 + ); // 3 minutes timeout }); describe("POST /v0/crawlWebsitePreview", () => { @@ -873,14 +1053,17 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/crawlWebsitePreview") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); // it.concurrent("should return an error for a blocklisted URL", async () => { // const blocklistedUrl = "https://instagram.com/fake-test"; @@ -894,15 +1077,19 @@ describe("E2E Tests for API Routes", () => { // expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); // }); - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev", timeout: 1000 }); + it.concurrent( + "should return a timeout error when scraping takes longer than the specified timeout", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev", timeout: 1000 }); - expect(response.statusCode).toBe(408); - }, 3000); + expect(response.statusCode).toBe(408); + }, + 3000 + ); // it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => { // const response = await request(TEST_URL) @@ -924,26 +1111,33 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(401); - }); + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(401); + } + ); - it.concurrent("should return a successful response with a valid API key for search", async () => { - const response = await request(TEST_URL) - .post("/v0/search") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ query: "test" }); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success"); - expect(response.body.success).toBe(true); - expect(response.body).toHaveProperty("data"); - }, 30000); // 30 seconds timeout + it.concurrent( + "should return a successful response with a valid API key for search", + async () => { + const response = await request(TEST_URL) + .post("/v0/search") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ query: "test" }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success"); + expect(response.body.success).toBe(true); + expect(response.body).toHaveProperty("data"); + }, + 30000 + ); // 30 seconds timeout }); describe("GET /v0/crawl/status/:jobId", () => { @@ -952,123 +1146,217 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(401); }); - it.concurrent("should return an error response with an invalid API key", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/123") - .set("Authorization", `Bearer invalid-api-key`); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return Job not found for invalid job ID", async () => { - const response = await request(TEST_URL) - .get("/v0/crawl/status/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - }); - - it.concurrent("should return a successful crawl status response for a valid crawl job", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://mendable.ai/blog" }); - expect(crawlResponse.statusCode).toBe(200); - - let isCompleted = false; - let completedResponse; - - while (!isCompleted) { + it.concurrent( + "should return an error response with an invalid API key", + async () => { const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .get("/v0/crawl/status/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v0/crawl/status/invalidJobId") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isCompleted = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } + expect(response.statusCode).toBe(404); } - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); + ); - const childrenLinks = completedResponse.body.data.filter(doc => - doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") - ); + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://mendable.ai/blog" }); + expect(crawlResponse.statusCode).toBe(200); - expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, 180000); // 120 seconds - - it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ', async () => { - const crawlResponse = await request(TEST_URL) - .post('/v0/crawl') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001', crawlerOptions: { limit: 10, excludes: [ 'list/*', 'login', 'abs/*', 'static/*', 'about/*', 'archive/*' ] }}); - expect(crawlResponse.statusCode).toBe(200); + let isCompleted = false; + let completedResponse; - let isCompleted = false; - let completedResponse; + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); - while (!isCompleted) { - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('status'); - - if (response.body.status === 'completed') { - isCompleted = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } } - } - expect(completedResponse.body.status).toBe('completed'); - expect(completedResponse.body).toHaveProperty('data'); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].content).toContain("Mendable"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + + const childrenLinks = completedResponse.body.data.filter( + (doc) => + doc.metadata && + doc.metadata.sourceURL && + doc.metadata.sourceURL.includes("mendable.ai/blog") + ); + + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 120 seconds + + it.concurrent( + "should return a successful response for a valid crawl job with PDF files without explicit .pdf extension ", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://arxiv.org/pdf/astro-ph/9301001", + crawlerOptions: { + limit: 10, + excludes: [ + "list/*", + "login", + "abs/*", + "static/*", + "about/*", + "archive/*" + ] + } + }); + expect(crawlResponse.statusCode).toBe(200); + + let isCompleted = false; + let completedResponse; + + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isCompleted = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body.data.length).toEqual(1); expect(completedResponse.body.data).toEqual( expect.arrayContaining([ expect.objectContaining({ - content: expect.stringContaining('asymmetries might represent, for instance, preferred source orientations to our line of sight.') + content: expect.stringContaining( + "asymmetries might represent, for instance, preferred source orientations to our line of sight." + ) }) ]) ); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - }, 180000); // 120 seconds + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); + expect( + completedResponse.body.data[0].metadata.pageError + ).toBeUndefined(); + }, + 180000 + ); // 120 seconds + it.concurrent( + "should return a successful response for a valid crawl job with includeHtml set to true option (2)", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://roastmywebsite.ai", + pageOptions: { includeHtml: true } + }); + expect(crawlResponse.statusCode).toBe(200); + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + expect(["active", "waiting"]).toContain(response.body.status); - it.concurrent("should return a successful response for a valid crawl job with includeHtml set to true option (2)", async () => { + let isFinished = false; + let completedResponse; + + while (!isFinished) { + const response = await request(TEST_URL) + .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + + if (response.body.status === "completed") { + isFinished = true; + completedResponse = response; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).toHaveProperty("html"); + expect(completedResponse.body.data[0].content).toContain("_Roast_"); + expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); + expect(completedResponse.body.data[0].html).toContain(" { const crawlResponse = await request(TEST_URL) .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ - url: "https://roastmywebsite.ai", + url: "https://mendable.ai/blog", pageOptions: { includeHtml: true }, + crawlerOptions: { allowBackwardCrawling: true } }); expect(crawlResponse.statusCode).toBe(200); - const response = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting"]).toContain(response.body.status); - let isFinished = false; let completedResponse; @@ -1095,190 +1383,167 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0].content).toContain("_Roast_"); - expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); - expect(completedResponse.body.data[0].html).toContain(" { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai/blog", - pageOptions: { includeHtml: true }, - crawlerOptions: { allowBackwardCrawling: true }, + const onlyChildrenLinks = completedResponse.body.data.filter((doc) => { + return ( + doc.metadata && + doc.metadata.sourceURL && + doc.metadata.sourceURL.includes("mendable.ai/blog") + ); }); - expect(crawlResponse.statusCode).toBe(200); - - let isFinished = false; - let completedResponse; - while (!isFinished) { - const response = await request(TEST_URL) + expect(completedResponse.body.data.length).toBeGreaterThan( + onlyChildrenLinks.length + ); + }, + 60000 + ); + + it.concurrent( + "If someone cancels a crawl job, it should turn into failed status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://jestjs.io" }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 20000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - if (response.body.status === "completed") { - isFinished = true; - completedResponse = response; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again - } - } - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).toHaveProperty("html"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].markdown).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined(); - - const onlyChildrenLinks = completedResponse.body.data.filter(doc => { - return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog") - }); - - expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length); - }, 60000); - - it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://jestjs.io" }); - - expect(crawlResponse.statusCode).toBe(200); - - await new Promise((r) => setTimeout(r, 20000)); - - const responseCancel = await request(TEST_URL) - .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); - - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("failed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data).toBeNull(); - expect(completedResponse.body).toHaveProperty("partial_data"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.partial_data[0].metadata.pageStatusCode).toBe(200); - expect(completedResponse.body.partial_data[0].metadata.pageError).toBeUndefined(); - }, 60000); // 60 seconds + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("failed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data).toBeNull(); + expect(completedResponse.body).toHaveProperty("partial_data"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); + expect( + completedResponse.body.partial_data[0].metadata.pageStatusCode + ).toBe(200); + expect( + completedResponse.body.partial_data[0].metadata.pageError + ).toBeUndefined(); + }, + 60000 + ); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { - it.concurrent("should extract data using LLM extraction mode", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - pageOptions: { - onlyMainContent: true, - }, - extractorOptions: { - mode: "llm-extraction", - extractionPrompt: - "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - extractionSchema: { - type: "object", - properties: { - company_mission: { - type: "string", - }, - supports_sso: { - type: "boolean", - }, - is_open_source: { - type: "boolean", - }, - }, - required: ["company_mission", "supports_sso", "is_open_source"], + it.concurrent( + "should extract data using LLM extraction mode", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + pageOptions: { + onlyMainContent: true }, - }, - }); - - // Ensure that the job was successfully created before proceeding with LLM extraction - expect(response.statusCode).toBe(200); - - // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` - let llmExtraction = response.body.data.llm_extraction; - - // Check if the llm_extraction object has the required properties with correct types and values - expect(llmExtraction).toHaveProperty("company_mission"); - expect(typeof llmExtraction.company_mission).toBe("string"); - expect(llmExtraction).toHaveProperty("supports_sso"); - expect(llmExtraction.supports_sso).toBe(true); - expect(typeof llmExtraction.supports_sso).toBe("boolean"); - expect(llmExtraction).toHaveProperty("is_open_source"); - expect(llmExtraction.is_open_source).toBe(false); - expect(typeof llmExtraction.is_open_source).toBe("boolean"); - }, 60000); // 60 secs - - it.concurrent("should extract data using LLM extraction mode with RawHtml", async () => { - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://mendable.ai", - - extractorOptions: { - mode: "llm-extraction-from-raw-html", - extractionPrompt: - "Based on the information on the page, what are the primary and secondary CTA buttons?", - extractionSchema: { - type: "object", - properties: { - primary_cta: { - type: "string", + extractorOptions: { + mode: "llm-extraction", + extractionPrompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + extractionSchema: { + type: "object", + properties: { + company_mission: { + type: "string" + }, + supports_sso: { + type: "boolean" + }, + is_open_source: { + type: "boolean" + } }, - secondary_cta: { - type: "string", + required: ["company_mission", "supports_sso", "is_open_source"] + } + } + }); + + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); + + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; + + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("company_mission"); + expect(typeof llmExtraction.company_mission).toBe("string"); + expect(llmExtraction).toHaveProperty("supports_sso"); + expect(llmExtraction.supports_sso).toBe(true); + expect(typeof llmExtraction.supports_sso).toBe("boolean"); + expect(llmExtraction).toHaveProperty("is_open_source"); + expect(llmExtraction.is_open_source).toBe(false); + expect(typeof llmExtraction.is_open_source).toBe("boolean"); + }, + 60000 + ); // 60 secs + + it.concurrent( + "should extract data using LLM extraction mode with RawHtml", + async () => { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://mendable.ai", + + extractorOptions: { + mode: "llm-extraction-from-raw-html", + extractionPrompt: + "Based on the information on the page, what are the primary and secondary CTA buttons?", + extractionSchema: { + type: "object", + properties: { + primary_cta: { + type: "string" + }, + secondary_cta: { + type: "string" + } }, - }, - required: ["primary_cta", "secondary_cta"], - }, - }, - }); + required: ["primary_cta", "secondary_cta"] + } + } + }); - // Ensure that the job was successfully created before proceeding with LLM extraction - expect(response.statusCode).toBe(200); + // Ensure that the job was successfully created before proceeding with LLM extraction + expect(response.statusCode).toBe(200); - // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` - let llmExtraction = response.body.data.llm_extraction; + // Assuming the LLM extraction object is available in the response body under `data.llm_extraction` + let llmExtraction = response.body.data.llm_extraction; - // Check if the llm_extraction object has the required properties with correct types and values - expect(llmExtraction).toHaveProperty("primary_cta"); - expect(typeof llmExtraction.primary_cta).toBe("string"); - expect(llmExtraction).toHaveProperty("secondary_cta"); - expect(typeof llmExtraction.secondary_cta).toBe("string"); - - }, 60000); // 60 secs + // Check if the llm_extraction object has the required properties with correct types and values + expect(llmExtraction).toHaveProperty("primary_cta"); + expect(typeof llmExtraction.primary_cta).toBe("string"); + expect(llmExtraction).toHaveProperty("secondary_cta"); + expect(typeof llmExtraction.secondary_cta).toBe("string"); + }, + 60000 + ); // 60 secs }); // describe("POST /v0/scrape for Top 100 Companies", () => { @@ -1340,60 +1605,63 @@ describe("E2E Tests for API Routes", () => { // }); describe("POST /v0/crawl with fast mode", () => { - it.concurrent("should complete the crawl under 20 seconds", async () => { - const startTime = Date.now(); + it.concurrent( + "should complete the crawl under 20 seconds", + async () => { + const startTime = Date.now(); - const crawlResponse = await request(TEST_URL) - .post("/v0/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://flutterbricks.com", - crawlerOptions: { - mode: "fast" + const crawlResponse = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://flutterbricks.com", + crawlerOptions: { + mode: "fast" + } + }); + + expect(crawlResponse.statusCode).toBe(200); + + const jobId = crawlResponse.body.jobId; + let statusResponse; + let isFinished = false; + + while (!isFinished) { + statusResponse = await request(TEST_URL) + .get(`/v0/crawl/status/${jobId}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(statusResponse.statusCode).toBe(200); + isFinished = statusResponse.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } - }); - - expect(crawlResponse.statusCode).toBe(200); - - const jobId = crawlResponse.body.jobId; - let statusResponse; - let isFinished = false; - - while (!isFinished) { - statusResponse = await request(TEST_URL) - .get(`/v0/crawl/status/${jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(statusResponse.statusCode).toBe(200); - isFinished = statusResponse.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } - } - // const endTime = Date.now(); - // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + // const endTime = Date.now(); + // const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds - // console.log(`Time elapsed: ${timeElapsed} seconds`); + // console.log(`Time elapsed: ${timeElapsed} seconds`); - expect(statusResponse.body.status).toBe("completed"); - expect(statusResponse.body).toHaveProperty("data"); - expect(statusResponse.body.data[0]).toHaveProperty("content"); - expect(statusResponse.body.data[0]).toHaveProperty("markdown"); - expect(statusResponse.body.data[0]).toHaveProperty("metadata"); - expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200); - expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined(); + expect(statusResponse.body.status).toBe("completed"); + expect(statusResponse.body).toHaveProperty("data"); + expect(statusResponse.body.data[0]).toHaveProperty("content"); + expect(statusResponse.body.data[0]).toHaveProperty("markdown"); + expect(statusResponse.body.data[0]).toHaveProperty("metadata"); + expect(statusResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(statusResponse.body.data[0].metadata.pageError).toBeUndefined(); - const results = statusResponse.body.data; - // results.forEach((result, i) => { - // console.log(result.metadata.sourceURL); - // }); - expect(results.length).toBeGreaterThanOrEqual(10); - expect(results.length).toBeLessThanOrEqual(15); - - }, 20000); + const results = statusResponse.body.data; + // results.forEach((result, i) => { + // console.log(result.metadata.sourceURL); + // }); + expect(results.length).toBeGreaterThanOrEqual(10); + expect(results.length).toBeLessThanOrEqual(15); + }, + 20000 + ); // it.concurrent("should complete the crawl in more than 10 seconds", async () => { // const startTime = Date.now(); @@ -1440,7 +1708,7 @@ describe("E2E Tests for API Routes", () => { // // }); // expect(results.length).toBeGreaterThanOrEqual(10); // expect(results.length).toBeLessThanOrEqual(15); - + // }, 50000);// 15 seconds timeout to account for network delays }); @@ -1453,24 +1721,28 @@ describe("E2E Tests for API Routes", () => { }); describe("Rate Limiter", () => { - it.concurrent("should return 429 when rate limit is exceeded for preview token", async () => { - for (let i = 0; i < 5; i++) { + it.concurrent( + "should return 429 when rate limit is exceeded for preview token", + async () => { + for (let i = 0; i < 5; i++) { + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer this_is_just_a_preview_token`) + .set("Content-Type", "application/json") + .send({ url: "https://www.scrapethissite.com" }); + + expect(response.statusCode).toBe(200); + } const response = await request(TEST_URL) .post("/v0/scrape") .set("Authorization", `Bearer this_is_just_a_preview_token`) .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com" }); - expect(response.statusCode).toBe(200); - } - const response = await request(TEST_URL) - .post("/v0/scrape") - .set("Authorization", `Bearer this_is_just_a_preview_token`) - .set("Content-Type", "application/json") - .send({ url: "https://www.scrapethissite.com" }); - - expect(response.statusCode).toBe(429); - }, 90000); + expect(response.statusCode).toBe(429); + }, + 90000 + ); }); // it.concurrent("should return 429 when rate limit is exceeded for API key", async () => { diff --git a/apps/api/src/__tests__/e2e_map/index.test.ts b/apps/api/src/__tests__/e2e_map/index.test.ts index b065dff1..948f097e 100644 --- a/apps/api/src/__tests__/e2e_map/index.test.ts +++ b/apps/api/src/__tests__/e2e_map/index.test.ts @@ -15,7 +15,7 @@ describe("E2E Tests for Map API Routes", () => { .send({ url: "https://firecrawl.dev", sitemapOnly: false, - search: "smart-crawl", + search: "smart-crawl" }); console.log(response.body); @@ -37,7 +37,7 @@ describe("E2E Tests for Map API Routes", () => { .send({ url: "https://firecrawl.dev", sitemapOnly: false, - includeSubdomains: true, + includeSubdomains: true }); console.log(response.body); @@ -60,7 +60,7 @@ describe("E2E Tests for Map API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", - sitemapOnly: true, + sitemapOnly: true }); console.log(response.body); @@ -84,7 +84,7 @@ describe("E2E Tests for Map API Routes", () => { .send({ url: "https://firecrawl.dev", sitemapOnly: false, - limit: 10, + limit: 10 }); console.log(response.body); @@ -104,7 +104,7 @@ describe("E2E Tests for Map API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://geekflare.com/sitemap_index.xml", - sitemapOnly: true, + sitemapOnly: true }); console.log(response.body); diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index 83f676b8..9c3ddf33 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -32,7 +32,6 @@ describe("E2E Tests for API Routes with No Authentication", () => { process.env = originalEnv; }); - describe("GET /", () => { it("should return Hello, world! message", async () => { const response = await request(TEST_URL).get("/"); @@ -62,7 +61,9 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(response.body.error).toContain( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); }); it("should return a successful response", async () => { @@ -87,7 +88,9 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(response.body.error).toContain( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); }); it("should return a successful response", async () => { @@ -116,7 +119,9 @@ describe("E2E Tests for API Routes with No Authentication", () => { .set("Content-Type", "application/json") .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); - expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(response.body.error).toContain( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); }); it("should return a successful response", async () => { @@ -199,8 +204,6 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - - }, 60000); // 60 seconds }); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index e1f5f3fa..33e3be5d 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -2,7 +2,7 @@ import request from "supertest"; import { configDotenv } from "dotenv"; import { ScrapeRequestInput, - ScrapeResponseRequestTest, + ScrapeResponseRequestTest } from "../../controllers/v1/types"; configDotenv(); @@ -19,15 +19,17 @@ describe("E2E Tests for v1 API Routes", () => { describe("GET /is-production", () => { it.concurrent("should return the production status", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).get( - "/is-production" - ); + const response: ScrapeResponseRequestTest = + await request(TEST_URL).get("/is-production"); - console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION); - console.log('?', process.env.USE_DB_AUTHENTICATION === 'true'); - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - console.log('!!useDbAuthentication', !!useDbAuthentication); - console.log('!useDbAuthentication', !useDbAuthentication); + console.log( + "process.env.USE_DB_AUTHENTICATION", + process.env.USE_DB_AUTHENTICATION + ); + console.log("?", process.env.USE_DB_AUTHENTICATION === "true"); + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; + console.log("!!useDbAuthentication", !!useDbAuthentication); + console.log("!useDbAuthentication", !useDbAuthentication); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("isProduction"); @@ -37,15 +39,15 @@ describe("E2E Tests for v1 API Routes", () => { describe("POST /v1/scrape", () => { it.concurrent("should require authorization", async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .send({ url: "https://firecrawl.dev"}) + .post("/v1/scrape") + .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); it.concurrent("should throw error for blocklisted URL", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://facebook.com/fake-test", + url: "https://facebook.com/fake-test" }; const response = await request(TEST_URL) @@ -55,7 +57,9 @@ describe("E2E Tests for v1 API Routes", () => { .send(scrapeRequest); expect(response.statusCode).toBe(403); - expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); + expect(response.body.error).toBe( + "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." + ); }); it.concurrent( @@ -74,7 +78,7 @@ describe("E2E Tests for v1 API Routes", () => { "should return a successful response with a valid API key", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://roastmywebsite.ai", + url: "https://roastmywebsite.ai" }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -126,7 +130,7 @@ describe("E2E Tests for v1 API Routes", () => { "should return a successful response with a valid API key", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://arxiv.org/abs/2410.04840", + url: "https://arxiv.org/abs/2410.04840" }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -146,8 +150,12 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data.markdown).toContain("Strong Model Collapse"); expect(response.body.data.metadata.error).toBeUndefined(); - expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse"); - expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse"); + expect(response.body.data.metadata.description).toContain( + "Abstract page for arXiv paper 2410.04840: Strong Model Collapse" + ); + expect(response.body.data.metadata.citation_title).toBe( + "Strong Model Collapse" + ); expect(response.body.data.metadata.citation_author).toEqual([ "Dohmatob, Elvis", "Feng, Yunzhen", @@ -155,11 +163,21 @@ describe("E2E Tests for v1 API Routes", () => { "Kempe, Julia" ]); expect(response.body.data.metadata.citation_date).toBe("2024/10/07"); - expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08"); - expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840"); - expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840"); - expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm"); - expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840"); + expect(response.body.data.metadata.citation_online_date).toBe( + "2024/10/08" + ); + expect(response.body.data.metadata.citation_pdf_url).toBe( + "http://arxiv.org/pdf/2410.04840" + ); + expect(response.body.data.metadata.citation_arxiv_id).toBe( + "2410.04840" + ); + expect(response.body.data.metadata.citation_abstract).toContain( + "Within the scaling laws paradigm" + ); + expect(response.body.data.metadata.sourceURL).toBe( + "https://arxiv.org/abs/2410.04840" + ); expect(response.body.data.metadata.statusCode).toBe(200); }, 30000 @@ -169,7 +187,7 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", - formats: ["markdown", "html"], + formats: ["markdown", "html"] }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -177,7 +195,7 @@ describe("E2E Tests for v1 API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { @@ -193,62 +211,77 @@ describe("E2E Tests for v1 API Routes", () => { }, 30000 ); - it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { + it.concurrent( + "should return a successful response for a valid scrape with PDF file", + async () => { const scrapeRequest: ScrapeRequestInput = { url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" - // formats: ["markdown", "html"], + // formats: ["markdown", "html"], }; const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send(scrapeRequest); - await new Promise((r) => setTimeout(r, 6000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); - expect(response.body.data.metadata.statusCode).toBe(200); - expect(response.body.data.metadata.error).toBeUndefined(); - }, 60000); - - it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { - const scrapeRequest: ScrapeRequestInput = { - url: "https://arxiv.org/pdf/astro-ph/9301001" - }; - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send(scrapeRequest); - await new Promise((r) => setTimeout(r, 6000)); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy'); - expect(response.body.data.metadata.statusCode).toBe(200); - expect(response.body.data.metadata.error).toBeUndefined(); - }, 60000); - - it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { - const scrapeRequest: ScrapeRequestInput = { - url: "https://www.scrapethissite.com/", - onlyMainContent: false // default is true - }; - const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain( + "Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 60000 + ); + + it.concurrent( + "should return a successful response for a valid scrape with PDF file without explicit .pdf extension", + async () => { + const scrapeRequest: ScrapeRequestInput = { + url: "https://arxiv.org/pdf/astro-ph/9301001" + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain( + "Broad Line Radio Galaxy" + ); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 60000 + ); + + it.concurrent( + "should return a successful response with a valid API key with removeTags option", + async () => { + const scrapeRequest: ScrapeRequestInput = { + url: "https://www.scrapethissite.com/", + onlyMainContent: false // default is true + }; + const responseWithoutRemoveTags: ScrapeResponseRequestTest = + await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); expect(responseWithoutRemoveTags.statusCode).toBe(200); expect(responseWithoutRemoveTags.body).toHaveProperty("data"); @@ -258,13 +291,17 @@ describe("E2E Tests for v1 API Routes", () => { expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown"); expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); - expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav - expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer - + expect(responseWithoutRemoveTags.body.data.markdown).toContain( + "[FAQ](/faq/)" + ); // .nav + expect(responseWithoutRemoveTags.body.data.markdown).toContain( + "Hartley Brody 2023" + ); // #footer + const scrapeRequestWithRemoveTags: ScrapeRequestInput = { - url: "https://www.scrapethissite.com/", - excludeTags: ['.nav', '#footer', 'strong'], - onlyMainContent: false // default is true + url: "https://www.scrapethissite.com/", + excludeTags: [".nav", "#footer", "strong"], + onlyMainContent: false // default is true }; const response: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/scrape") @@ -281,725 +318,757 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data).not.toHaveProperty("html"); expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); - expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // - }, 30000); + expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // + }, + 30000 + ); - it.concurrent('should return a successful response for a scrape with 400 page', async () => { + it.concurrent( + "should return a successful response for a scrape with 400 page", + async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/400' }); + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/400" }); await new Promise((r) => setTimeout(r, 5000)); - + expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); + expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(400); - }, 60000); + }, + 60000 + ); - - it.concurrent('should return a successful response for a scrape with 401 page', async () => { + it.concurrent( + "should return a successful response for a scrape with 401 page", + async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/401' }); + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/401" }); await new Promise((r) => setTimeout(r, 5000)); - + expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); + expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(401); - }, 60000); + }, + 60000 + ); - // Removed it as we want to retry fallback to the next scraper - // it.concurrent('should return a successful response for a scrape with 403 page', async () => { - // const response: ScrapeResponseRequestTest = await request(TEST_URL) - // .post('/v1/scrape') - // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - // .set('Content-Type', 'application/json') - // .send({ url: 'https://httpstat.us/403' }); - // await new Promise((r) => setTimeout(r, 5000)); - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty('data'); - // if (!("data" in response.body)) { - // throw new Error("Expected response body to have 'data' property"); - // } - // expect(response.body.data).toHaveProperty('markdown'); - // expect(response.body.data).toHaveProperty('metadata'); - // expect(response.body.data.metadata.statusCode).toBe(403); - // }, 60000); + // Removed it as we want to retry fallback to the next scraper + // it.concurrent('should return a successful response for a scrape with 403 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/403' }); + // await new Promise((r) => setTimeout(r, 5000)); - it.concurrent('should return a successful response for a scrape with 404 page', async () => { + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(403); + // }, 60000); + + it.concurrent( + "should return a successful response for a scrape with 404 page", + async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/404' }); + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/404" }); await new Promise((r) => setTimeout(r, 5000)); - + expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); + expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(404); - }, 60000); + }, + 60000 + ); - // it.concurrent('should return a successful response for a scrape with 405 page', async () => { - // const response: ScrapeResponseRequestTest = await request(TEST_URL) - // .post('/v1/scrape') - // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - // .set('Content-Type', 'application/json') - // .send({ url: 'https://httpstat.us/405' }); - // await new Promise((r) => setTimeout(r, 5000)); - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty('data'); - // if (!("data" in response.body)) { - // throw new Error("Expected response body to have 'data' property"); - // } - // expect(response.body.data).toHaveProperty('markdown'); - // expect(response.body.data).toHaveProperty('metadata'); - // expect(response.body.data.metadata.statusCode).toBe(405); - // }, 60000); + // it.concurrent('should return a successful response for a scrape with 405 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/405' }); + // await new Promise((r) => setTimeout(r, 5000)); - // it.concurrent('should return a successful response for a scrape with 500 page', async () => { - // const response: ScrapeResponseRequestTest = await request(TEST_URL) - // .post('/v1/scrape') - // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - // .set('Content-Type', 'application/json') - // .send({ url: 'https://httpstat.us/500' }); - // await new Promise((r) => setTimeout(r, 5000)); - - // expect(response.statusCode).toBe(200); - // expect(response.body).toHaveProperty('data'); - // if (!("data" in response.body)) { - // throw new Error("Expected response body to have 'data' property"); - // } - // expect(response.body.data).toHaveProperty('markdown'); - // expect(response.body.data).toHaveProperty('metadata'); - // expect(response.body.data.metadata.statusCode).toBe(500); - // }, 60000); + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(405); + // }, 60000); - it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => { + // it.concurrent('should return a successful response for a scrape with 500 page', async () => { + // const response: ScrapeResponseRequestTest = await request(TEST_URL) + // .post('/v1/scrape') + // .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + // .set('Content-Type', 'application/json') + // .send({ url: 'https://httpstat.us/500' }); + // await new Promise((r) => setTimeout(r, 5000)); + + // expect(response.statusCode).toBe(200); + // expect(response.body).toHaveProperty('data'); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data).toHaveProperty('markdown'); + // expect(response.body.data).toHaveProperty('metadata'); + // expect(response.body.data.metadata.statusCode).toBe(500); + // }, 60000); + + it.concurrent( + "should return a timeout error when scraping takes longer than the specified timeout", + async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", timeout: 1000 }); - + expect(response.statusCode).toBe(408); - }, 3000); + }, + 3000 + ); - it.concurrent( - "should return a successful response with a valid API key and includeHtml set to true", - async () => { - const scrapeRequest: ScrapeRequestInput = { - url: "https://roastmywebsite.ai", - formats: ["html","rawHtml"], - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).not.toHaveProperty("markdown"); - expect(response.body.data).toHaveProperty("html"); - expect(response.body.data).toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.html).toContain(" { + const scrapeRequest: ScrapeRequestInput = { + url: "https://roastmywebsite.ai", + formats: ["html", "rawHtml"] + }; - it.concurrent( - "should return a successful response with waitFor", - async () => { - const scrapeRequest: ScrapeRequestInput = { - url: "https://ycombinator.com/companies", - formats: ["markdown"], - waitFor: 8000 - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty("markdown"); - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data).not.toHaveProperty("links"); - expect(response.body.data).not.toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.markdown).toContain("PagerDuty"); - expect(response.body.data.metadata.statusCode).toBe(200); - expect(response.body.data.metadata.error).toBeUndefined(); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); - }, - 30000 - ); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + expect(response.body.data).toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.html).toContain(" { - const scrapeRequest: ScrapeRequestInput = { - url: "https://roastmywebsite.ai", - formats: ["links"], - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).not.toHaveProperty("html"); - expect(response.body.data).not.toHaveProperty("rawHtml"); - expect(response.body.data).toHaveProperty("links"); - expect(response.body.data).toHaveProperty("metadata"); - expect(response.body.data.links).toContain("https://firecrawl.dev"); - expect(response.body.data.metadata.statusCode).toBe(200); - expect(response.body.data.metadata.error).toBeUndefined(); - }, - 30000 - ); - + it.concurrent( + "should return a successful response with waitFor", + async () => { + const scrapeRequest: ScrapeRequestInput = { + url: "https://ycombinator.com/companies", + formats: ["markdown"], + waitFor: 8000 + }; + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("links"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.markdown).toContain("PagerDuty"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 30000 + ); + + it.concurrent( + "should return a successful response with a valid links on page", + async () => { + const scrapeRequest: ScrapeRequestInput = { + url: "https://roastmywebsite.ai", + formats: ["links"] + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data).not.toHaveProperty("rawHtml"); + expect(response.body.data).toHaveProperty("links"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.links).toContain("https://firecrawl.dev"); + expect(response.body.data.metadata.statusCode).toBe(200); + expect(response.body.data.metadata.error).toBeUndefined(); + }, + 30000 + ); }); -describe("POST /v1/map", () => { - it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return an error response with an invalid API key", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should return a successful response with a valid API key", async () => { - const mapRequest = { - url: "https://roastmywebsite.ai" - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - }); - - it.concurrent("should return a successful response with a valid API key and search", async () => { - const mapRequest = { - url: "https://usemotion.com", - search: "pricing" - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - expect(links[0]).toContain("usemotion.com/pricing"); - }); - - it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => { - const mapRequest = { - url: "https://firecrawl.dev", - search: "docs", - includeSubdomains: true - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - - const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); - expect(containsDocsFirecrawlDev).toBe(true); - }); - - it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { - const mapRequest = { - url: "https://www.firecrawl.dev", - search: "docs", - includeSubdomains: true - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - - const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); - expect(containsDocsFirecrawlDev).toBe(true); - }, 10000) - - it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { - const mapRequest = { - url: "https://www.firecrawl.dev", - search: "docs", - includeSubdomains: false - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("links"); - if (!("links" in response.body)) { - throw new Error("Expected response body to have 'links' property"); - } - const links = response.body.links as unknown[]; - expect(Array.isArray(links)).toBe(true); - expect(links.length).toBeGreaterThan(0); - expect(links[0]).not.toContain("docs.firecrawl.dev"); - }) - - it.concurrent("should return an error for invalid URL", async () => { - const mapRequest = { - url: "invalid-url", - includeSubdomains: true, - search: "test", - }; - - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/map") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(mapRequest); - - expect(response.statusCode).toBe(400); - expect(response.body).toHaveProperty("success", false); - expect(response.body).toHaveProperty("error"); - }); -}); - - -describe("POST /v1/crawl", () => { - it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/crawl") - .send({ url: "https://firecrawl.dev" }); - expect(response.statusCode).toBe(401); - }); - - it.concurrent("should throw error for blocklisted URL", async () => { - const scrapeRequest: ScrapeRequestInput = { - url: "https://facebook.com/fake-test", - }; - - const response = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(403); - expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); - }); - - it.concurrent( - "should return an error response with an invalid API key", - async () => { + describe("POST /v1/map", () => { + it.concurrent("should require authorization", async () => { const response: ScrapeResponseRequestTest = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer invalid-api-key`) - .set("Content-Type", "application/json") + .post("/v1/map") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } - ); + }); - it.concurrent("should return a successful response", async () => { - const response = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://firecrawl.dev" }); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("id"); - expect(response.body.id).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } ); - expect(response.body).toHaveProperty("success", true); - expect(response.body).toHaveProperty("url"); - expect(response.body.url).toContain("/v1/crawl/"); - }); - it.concurrent( - "should return a successful response with a valid API key and valid includes option", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://firecrawl.dev", - limit: 40, - includePaths: ["blog/*"], - }); + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const mapRequest = { + url: "https://roastmywebsite.ai" + }; - let response; - let isFinished = false; - - while (!isFinished) { - response = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); } + ); - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + it.concurrent( + "should return a successful response with a valid API key and search", + async () => { + const mapRequest = { + url: "https://usemotion.com", + search: "pricing" + }; - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(5); - urls.forEach((url: string) => { - expect(url).toContain("firecrawl.dev/blog"); - }); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0 - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - }, - 180000 - ); // 180 seconds - - it.concurrent( - "should return a successful response with a valid API key and valid excludes option", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ - url: "https://firecrawl.dev", - limit: 40, - excludePaths: ["blog/*"], - }); - - let isFinished = false; - let response; - - while (!isFinished) { - response = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - isFinished = response.body.status === "completed"; - - if (!isFinished) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("usemotion.com/pricing"); } + ); - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request( - TEST_URL - ) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + it.concurrent( + "should return a successful response with a valid API key and search and allowSubdomains", + async () => { + const mapRequest = { + url: "https://firecrawl.dev", + search: "docs", + includeSubdomains: true + }; - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(3); - urls.forEach((url: string) => { - expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); - }); - }, - 90000 - ); // 90 seconds + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); - it.concurrent( - "should return a successful response with max depth option for a valid crawl job", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + + const containsDocsFirecrawlDev = links.some((link: string) => + link.includes("docs.firecrawl.dev") + ); + expect(containsDocsFirecrawlDev).toBe(true); + } + ); + + it.concurrent( + "should return a successful response with a valid API key and search and allowSubdomains and www", + async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + + const containsDocsFirecrawlDev = links.some((link: string) => + link.includes("docs.firecrawl.dev") + ); + expect(containsDocsFirecrawlDev).toBe(true); + }, + 10000 + ); + + it.concurrent( + "should return a successful response with a valid API key and search and not allowSubdomains and www", + async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: false + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).not.toContain("docs.firecrawl.dev"); + } + ); + + it.concurrent("should return an error for invalid URL", async () => { + const mapRequest = { + url: "invalid-url", + includeSubdomains: true, + search: "test" + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ - url: "https://www.scrapethissite.com", - maxDepth: 1, - }); - expect(crawlResponse.statusCode).toBe(200); + .send(mapRequest); - const response = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("status"); - expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status); - // wait for 60 seconds - let isCompleted = false; - while (!isCompleted) { - const statusCheckResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(statusCheckResponse.statusCode).toBe(200); - isCompleted = statusCheckResponse.body.status === "completed"; - if (!isCompleted) { - await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again - } - } - const completedResponse = await request( - TEST_URL - ) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).not.toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL - ); - expect(urls.length).toBeGreaterThan(1); - - // Check if all URLs have a maximum depth of 1 - urls.forEach((url: string) => { - const pathSplits = new URL(url).pathname.split("/"); - const depth = - pathSplits.length - - (pathSplits[0].length === 0 && - pathSplits[pathSplits.length - 1].length === 0 - ? 1 - : 0); - expect(depth).toBeLessThanOrEqual(2); - }); - }, - 180000 - ); -}) - -describe("GET /v1/crawl/:jobId", () => { - it.concurrent("should require authorization", async () => { - const response = await request(TEST_URL).get("/v1/crawl/123"); - expect(response.statusCode).toBe(401); + expect(response.statusCode).toBe(400); + expect(response.body).toHaveProperty("success", false); + expect(response.body).toHaveProperty("error"); + }); }); - it.concurrent( - "should return an error response with an invalid API key", - async () => { - const response = await request(TEST_URL) - .get("/v1/crawl/123") - .set("Authorization", `Bearer invalid-api-key`); + describe("POST /v1/crawl", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } - ); + }); + + it.concurrent("should throw error for blocklisted URL", async () => { + const scrapeRequest: ScrapeRequestInput = { + url: "https://facebook.com/fake-test" + }; - it.concurrent( - "should return Job not found for invalid job ID", - async () => { const response = await request(TEST_URL) - .get("/v1/crawl/invalidJobId") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(response.statusCode).toBe(404); - } - ); - - it.concurrent( - "should return a successful crawl status response for a valid crawl job", - async () => { - const crawlResponse = await request(TEST_URL) .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://docs.firecrawl.dev" }); - expect(crawlResponse.statusCode).toBe(200); + .send(scrapeRequest); - let isCompleted = false; + expect(response.statusCode).toBe(403); + expect(response.body.error).toBe( + "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." + ); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + } + ); + + it.concurrent("should return a successful response", async () => { + const response = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("id"); + expect(response.body.id).toMatch( + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + ); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("url"); + expect(response.body.url).toContain("/v1/crawl/"); + }); + + it.concurrent( + "should return a successful response with a valid API key and valid includes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 40, + includePaths: ["blog/*"] + }); + + let response; + let isFinished = false; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(5); + urls.forEach((url: string) => { + expect(url).toContain("firecrawl.dev/blog"); + }); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0 + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 180000 + ); // 180 seconds + + it.concurrent( + "should return a successful response with a valid API key and valid excludes option", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://firecrawl.dev", + limit: 40, + excludePaths: ["blog/*"] + }); + + let isFinished = false; + let response; + + while (!isFinished) { + response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); + isFinished = response.body.status === "completed"; + + if (!isFinished) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(3); + urls.forEach((url: string) => { + expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); + }); + }, + 90000 + ); // 90 seconds + + it.concurrent( + "should return a successful response with max depth option for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ + url: "https://www.scrapethissite.com", + maxDepth: 1 + }); + expect(crawlResponse.statusCode).toBe(200); - while (!isCompleted) { const response = await request(TEST_URL) .get(`/v1/crawl/${crawlResponse.body.id}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); - - if (response.body.status === "completed") { - isCompleted = true; - } else { - await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + expect(["active", "waiting", "completed", "scraping"]).toContain( + response.body.status + ); + // wait for 60 seconds + let isCompleted = false; + while (!isCompleted) { + const statusCheckResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(statusCheckResponse.statusCode).toBe(200); + isCompleted = statusCheckResponse.body.status === "completed"; + if (!isCompleted) { + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again + } } + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + const urls = completedResponse.body.data.map( + (item: any) => item.metadata?.sourceURL + ); + expect(urls.length).toBeGreaterThan(1); + + // Check if all URLs have a maximum depth of 1 + urls.forEach((url: string) => { + const pathSplits = new URL(url).pathname.split("/"); + const depth = + pathSplits.length - + (pathSplits[0].length === 0 && + pathSplits[pathSplits.length - 1].length === 0 + ? 1 + : 0); + expect(depth).toBeLessThanOrEqual(2); + }); + }, + 180000 + ); + }); + + describe("GET /v1/crawl/:jobId", () => { + it.concurrent("should require authorization", async () => { + const response = await request(TEST_URL).get("/v1/crawl/123"); + expect(response.statusCode).toBe(401); + }); + + it.concurrent( + "should return an error response with an invalid API key", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/123") + .set("Authorization", `Bearer invalid-api-key`); + expect(response.statusCode).toBe(401); } + ); - await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database - const completedResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + it.concurrent( + "should return Job not found for invalid job ID", + async () => { + const response = await request(TEST_URL) + .get("/v1/crawl/invalidJobId") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(404); + } + ); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("completed"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).not.toHaveProperty("content"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect( - completedResponse.body.data[0].metadata.error - ).toBeUndefined(); + it.concurrent( + "should return a successful crawl status response for a valid crawl job", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.firecrawl.dev" }); + expect(crawlResponse.statusCode).toBe(200); - const childrenLinks = completedResponse.body.data.filter( - (doc) => - doc.metadata && - doc.metadata.sourceURL - ); + let isCompleted = false; - expect(childrenLinks.length).toBe(completedResponse.body.data.length); - }, - 180000 - ); // 120 seconds + while (!isCompleted) { + const response = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("status"); - it.concurrent( - "If someone cancels a crawl job, it should turn into failed status", - async () => { - const crawlResponse = await request(TEST_URL) - .post("/v1/crawl") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send({ url: "https://docs.firecrawl.dev", limit: 10 }); + if (response.body.status === "completed") { + isCompleted = true; + } else { + await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again + } + } - expect(crawlResponse.statusCode).toBe(200); + await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - await new Promise((r) => setTimeout(r, 10000)); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("completed"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).not.toHaveProperty("content"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - const responseCancel = await request(TEST_URL) - .delete(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - expect(responseCancel.statusCode).toBe(200); - expect(responseCancel.body).toHaveProperty("status"); - expect(responseCancel.body.status).toBe("cancelled"); + const childrenLinks = completedResponse.body.data.filter( + (doc) => doc.metadata && doc.metadata.sourceURL + ); - await new Promise((r) => setTimeout(r, 10000)); - const completedResponse = await request(TEST_URL) - .get(`/v1/crawl/${crawlResponse.body.id}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); - - expect(completedResponse.statusCode).toBe(200); - expect(completedResponse.body).toHaveProperty("status"); - expect(completedResponse.body.status).toBe("cancelled"); - expect(completedResponse.body).toHaveProperty("data"); - expect(completedResponse.body.data[0]).toHaveProperty("markdown"); - expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); - expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); - }, - 60000 - ); // 60 seconds -}) + expect(childrenLinks.length).toBe(completedResponse.body.data.length); + }, + 180000 + ); // 120 seconds + + it.concurrent( + "If someone cancels a crawl job, it should turn into failed status", + async () => { + const crawlResponse = await request(TEST_URL) + .post("/v1/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://docs.firecrawl.dev", limit: 10 }); + + expect(crawlResponse.statusCode).toBe(200); + + await new Promise((r) => setTimeout(r, 10000)); + + const responseCancel = await request(TEST_URL) + .delete(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + expect(responseCancel.statusCode).toBe(200); + expect(responseCancel.body).toHaveProperty("status"); + expect(responseCancel.body.status).toBe("cancelled"); + + await new Promise((r) => setTimeout(r, 10000)); + const completedResponse = await request(TEST_URL) + .get(`/v1/crawl/${crawlResponse.body.id}`) + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + + expect(completedResponse.statusCode).toBe(200); + expect(completedResponse.body).toHaveProperty("status"); + expect(completedResponse.body.status).toBe("cancelled"); + expect(completedResponse.body).toHaveProperty("data"); + expect(completedResponse.body.data[0]).toHaveProperty("markdown"); + expect(completedResponse.body.data[0]).toHaveProperty("metadata"); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); + }, + 60000 + ); // 60 seconds + }); }); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts index 5c7feb1f..e297f7c8 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts @@ -2,7 +2,7 @@ import request from "supertest"; import { configDotenv } from "dotenv"; import { ScrapeRequest, - ScrapeResponseRequestTest, + ScrapeResponseRequestTest } from "../../controllers/v1/types"; configDotenv(); @@ -10,31 +10,39 @@ const FIRECRAWL_API_URL = "http://127.0.0.1:3002"; const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test describe("E2E Tests for v1 API Routes", () => { + it.concurrent( + "should return a successful response for a scrape with 403 page", + async () => { + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: "https://httpstat.us/403" }); - it.concurrent('should return a successful response for a scrape with 403 page', async () => { - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) - .post('/v1/scrape') - .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) - .set('Content-Type', 'application/json') - .send({ url: 'https://httpstat.us/403' }); + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data.metadata.statusCode).toBe(403); + }, + 30000 + ); - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - expect(response.body.data).toHaveProperty('markdown'); - expect(response.body.data).toHaveProperty('metadata'); - expect(response.body.data.metadata.statusCode).toBe(403); - }, 30000); - - it.concurrent("should handle 'formats:markdown (default)' parameter correctly", + it.concurrent( + "should handle 'formats:markdown (default)' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL } as ScrapeRequest; - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -47,27 +55,41 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data).toHaveProperty("markdown"); - - expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl."); - expect(response.body.data.markdown).toContain("Content with id #content-1"); + + expect(response.body.data.markdown).toContain( + "This page is used for end-to-end (e2e) testing with Firecrawl." + ); + expect(response.body.data.markdown).toContain( + "Content with id #content-1" + ); // expect(response.body.data.markdown).toContain("Loading..."); expect(response.body.data.markdown).toContain("Click me!"); - expect(response.body.data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe - expect(response.body.data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom + expect(response.body.data.markdown).toContain( + "Power your AI apps with clean data crawled from any website. It's also open-source." + ); // firecrawl.dev inside an iframe + expect(response.body.data.markdown).toContain( + "This content loads only when you see it. Don't blink! 👼" + ); // the browser always scroll to the bottom expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default - expect(response.body.data.markdown).not.toContain("This content is only visible on mobile"); + expect(response.body.data.markdown).not.toContain( + "This content is only visible on mobile" + ); }, - 30000); + 30000 + ); - it.concurrent("should handle 'formats:html' parameter correctly", + it.concurrent( + "should handle 'formats:html' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, formats: ["html"] } as ScrapeRequest; - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -79,23 +101,30 @@ describe("E2E Tests for v1 API Routes", () => { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data).not.toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("html"); - expect(response.body.data.html).not.toContain("
Header
"); - expect(response.body.data.html).toContain("

This page is used for end-to-end (e2e) testing with Firecrawl.

"); + expect(response.body.data.html).not.toContain( + '
Header
' + ); + expect(response.body.data.html).toContain( + '

This page is used for end-to-end (e2e) testing with Firecrawl.

' + ); }, - 30000); + 30000 + ); - it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly", + it.concurrent( + "should handle 'rawHtml' in 'formats' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, formats: ["rawHtml"] } as ScrapeRequest; - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -110,45 +139,30 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).not.toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("rawHtml"); - expect(response.body.data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.

"); + expect(response.body.data.rawHtml).toContain( + ">This page is used for end-to-end (e2e) testing with Firecrawl.

" + ); expect(response.body.data.rawHtml).toContain(">Header"); }, - 30000); - + 30000 + ); + // - TODO: tests for links // - TODO: tests for screenshot // - TODO: tests for screenshot@fullPage - it.concurrent("should handle 'headers' parameter correctly", async () => { - // @ts-ignore - const scrapeRequest = { - url: E2E_TEST_SERVER_URL, - headers: { "e2e-header-test": "firecrawl" } - } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequest); - - expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty("data"); - if (!("data" in response.body)) { - throw new Error("Expected response body to have 'data' property"); - } - - expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl"); - }, 30000); - - it.concurrent("should handle 'includeTags' parameter correctly", + it.concurrent( + "should handle 'headers' parameter correctly", async () => { + // @ts-ignore const scrapeRequest = { url: E2E_TEST_SERVER_URL, - includeTags: ['#content-1'] + headers: { "e2e-header-test": "firecrawl" } } as ScrapeRequest; - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") @@ -160,73 +174,126 @@ describe("E2E Tests for v1 API Routes", () => { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data.markdown).not.toContain("

This page is used for end-to-end (e2e) testing with Firecrawl.

"); - expect(response.body.data.markdown).toContain("Content with id #content-1"); + expect(response.body.data.markdown).toContain( + "e2e-header-test: firecrawl" + ); }, - 30000); - - it.concurrent("should handle 'excludeTags' parameter correctly", + 30000 + ); + + it.concurrent( + "should handle 'includeTags' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - excludeTags: ['#content-1'] + includeTags: ["#content-1"] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl."); - expect(response.body.data.markdown).not.toContain("Content with id #content-1"); + expect(response.body.data.markdown).not.toContain( + "

This page is used for end-to-end (e2e) testing with Firecrawl.

" + ); + expect(response.body.data.markdown).toContain( + "Content with id #content-1" + ); }, - 30000); - - it.concurrent("should handle 'onlyMainContent' parameter correctly", + 30000 + ); + + it.concurrent( + "should handle 'excludeTags' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + excludeTags: ["#content-1"] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data.markdown).toContain( + "This page is used for end-to-end (e2e) testing with Firecrawl." + ); + expect(response.body.data.markdown).not.toContain( + "Content with id #content-1" + ); + }, + 30000 + ); + + it.concurrent( + "should handle 'onlyMainContent' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, formats: ["html", "markdown"], onlyMainContent: false } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - - expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl."); - expect(response.body.data.html).toContain("
Header
"); + + expect(response.body.data.markdown).toContain( + "This page is used for end-to-end (e2e) testing with Firecrawl." + ); + expect(response.body.data.html).toContain( + '
Header
' + ); }, - 30000); - - it.concurrent("should handle 'timeout' parameter correctly", + 30000 + ); + + it.concurrent( + "should handle 'timeout' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, timeout: 500 } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(408); if (!("error" in response.body)) { @@ -234,65 +301,87 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.error).toBe("Request timed out"); expect(response.body.success).toBe(false); - }, 30000); + }, + 30000 + ); - - it.concurrent("should handle 'mobile' parameter correctly", + it.concurrent( + "should handle 'mobile' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, mobile: true } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data.markdown).toContain("This content is only visible on mobile"); + expect(response.body.data.markdown).toContain( + "This content is only visible on mobile" + ); }, - 30000); - - it.concurrent("should handle 'parsePDF' parameter correctly", - async () => { - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + 30000 + ); + + it.concurrent( + "should handle 'parsePDF' parameter correctly", + async () => { + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'}); + .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" }); await new Promise((r) => setTimeout(r, 6000)); expect(response.statusCode).toBe(200); - expect(response.body).toHaveProperty('data'); + expect(response.body).toHaveProperty("data"); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993'); - expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm'); + expect(response.body.data.markdown).toContain( + "arXiv:astro-ph/9301001v1 7 Jan 1993" + ); + expect(response.body.data.markdown).not.toContain( + "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm" + ); - const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + const responseNoParsePDF: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false }); + .send({ + url: "https://arxiv.org/pdf/astro-ph/9301001.pdf", + parsePDF: false + }); await new Promise((r) => setTimeout(r, 6000)); expect(responseNoParsePDF.statusCode).toBe(200); - expect(responseNoParsePDF.body).toHaveProperty('data'); + expect(responseNoParsePDF.body).toHaveProperty("data"); if (!("data" in responseNoParsePDF.body)) { throw new Error("Expected response body to have 'data' property"); } - expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm'); + expect(responseNoParsePDF.body.data.markdown).toContain( + "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm" + ); }, - 30000); - + 30000 + ); + // it.concurrent("should handle 'location' parameter correctly", // async () => { // const scrapeRequest: ScrapeRequest = { @@ -302,76 +391,85 @@ describe("E2E Tests for v1 API Routes", () => { // languages: ["en"] // } // }; - + // const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) // .post("/v1/scrape") // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) // .set("Content-Type", "application/json") // .send(scrapeRequest); - + // expect(response.statusCode).toBe(200); // // Add assertions to verify location is handled correctly // }, // 30000); - - it.concurrent("should handle 'skipTlsVerification' parameter correctly", + + it.concurrent( + "should handle 'skipTlsVerification' parameter correctly", async () => { const scrapeRequest = { url: "https://expired.badssl.com/", timeout: 120000 } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - console.log("Error1a") - // console.log(response.body) + console.log("Error1a"); + // console.log(response.body) expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } expect(response.body.data.metadata.pageStatusCode).toBe(500); - console.log("Error?") - + console.log("Error?"); + const scrapeRequestWithSkipTlsVerification = { url: "https://expired.badssl.com/", skipTlsVerification: true, timeout: 120000 - } as ScrapeRequest; - - const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) - .post("/v1/scrape") - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) - .set("Content-Type", "application/json") - .send(scrapeRequestWithSkipTlsVerification); - - console.log("Error1b") + + const responseWithSkipTlsVerification: ScrapeResponseRequestTest = + await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequestWithSkipTlsVerification); + + console.log("Error1b"); // console.log(responseWithSkipTlsVerification.body) expect(responseWithSkipTlsVerification.statusCode).toBe(200); if (!("data" in responseWithSkipTlsVerification.body)) { throw new Error("Expected response body to have 'data' property"); } // console.log(responseWithSkipTlsVerification.body.data) - expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com"); + expect(responseWithSkipTlsVerification.body.data.markdown).toContain( + "badssl.com" + ); }, - 60000); - - it.concurrent("should handle 'removeBase64Images' parameter correctly", + 60000 + ); + + it.concurrent( + "should handle 'removeBase64Images' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, removeBase64Images: true } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); @@ -380,49 +478,63 @@ describe("E2E Tests for v1 API Routes", () => { // - TODO: not working for every image // expect(response.body.data.markdown).toContain("Image-Removed"); }, - 30000); + 30000 + ); - it.concurrent("should handle 'action wait' parameter correctly", + it.concurrent( + "should handle 'action wait' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - actions: [{ - type: "wait", - milliseconds: 10000 - }] + actions: [ + { + type: "wait", + milliseconds: 10000 + } + ] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } expect(response.body.data.markdown).not.toContain("Loading..."); - expect(response.body.data.markdown).toContain("Content loaded after 5 seconds!"); + expect(response.body.data.markdown).toContain( + "Content loaded after 5 seconds!" + ); }, - 30000); + 30000 + ); // screenshot - it.concurrent("should handle 'action screenshot' parameter correctly", + it.concurrent( + "should handle 'action screenshot' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - actions: [{ - type: "screenshot" - }] + actions: [ + { + type: "screenshot" + } + ] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); @@ -430,32 +542,42 @@ describe("E2E Tests for v1 API Routes", () => { if (!response.body.data.actions?.screenshots) { throw new Error("Expected response body to have screenshots array"); } - expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0); - expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"); + expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan( + 0 + ); + expect(response.body.data.actions.screenshots[0]).toContain( + "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-" + ); // TODO compare screenshot with expected screenshot }, - 30000); + 30000 + ); - it.concurrent("should handle 'action screenshot@fullPage' parameter correctly", + it.concurrent( + "should handle 'action screenshot@fullPage' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - actions: [{ - type: "screenshot", - fullPage: true - }, - { - type:"scrape" - }] + actions: [ + { + type: "screenshot", + fullPage: true + }, + { + type: "scrape" + } + ] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); @@ -464,77 +586,101 @@ describe("E2E Tests for v1 API Routes", () => { if (!response.body.data.actions?.screenshots) { throw new Error("Expected response body to have screenshots array"); } - expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0); - expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"); + expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan( + 0 + ); + expect(response.body.data.actions.screenshots[0]).toContain( + "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-" + ); if (!response.body.data.actions?.scrapes) { - throw new Error("Expected response body to have scrapes array"); + throw new Error("Expected response body to have scrapes array"); } - expect(response.body.data.actions.scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/"); - expect(response.body.data.actions.scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.

"); + expect(response.body.data.actions.scrapes[0].url).toBe( + "https://firecrawl-e2e-test.vercel.app/" + ); + expect(response.body.data.actions.scrapes[0].html).toContain( + "This page is used for end-to-end (e2e) testing with Firecrawl.

" + ); // TODO compare screenshot with expected full page screenshot }, - 30000); + 30000 + ); - it.concurrent("should handle 'action click' parameter correctly", + it.concurrent( + "should handle 'action click' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - actions: [{ - type: "click", - selector: "#click-me" - }] + actions: [ + { + type: "click", + selector: "#click-me" + } + ] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } expect(response.body.data.markdown).not.toContain("Click me!"); - expect(response.body.data.markdown).toContain("Text changed after click!"); + expect(response.body.data.markdown).toContain( + "Text changed after click!" + ); }, - 30000); + 30000 + ); - it.concurrent("should handle 'action write' parameter correctly", + it.concurrent( + "should handle 'action write' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, formats: ["html"], - actions: [{ - type: "click", - selector: "#input-1" - }, - { - type: "write", - text: "Hello, world!" - } - ]} as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + actions: [ + { + type: "click", + selector: "#input-1" + }, + { + type: "write", + text: "Hello, world!" + } + ] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + expect(response.statusCode).toBe(200); if (!("data" in response.body)) { throw new Error("Expected response body to have 'data' property"); } - + // TODO: fix this test (need to fix fire-engine first) // uncomment the following line: // expect(response.body.data.html).toContain(""); }, - 30000); + 30000 + ); // TODO: fix this test (need to fix fire-engine first) - it.concurrent("should handle 'action pressKey' parameter correctly", + it.concurrent( + "should handle 'action pressKey' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, @@ -546,13 +692,15 @@ describe("E2E Tests for v1 API Routes", () => { } ] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + // // TODO: fix this test (need to fix fire-engine first) // // right now response.body is: { success: false, error: '(Internal server error) - null' } // expect(response.statusCode).toBe(200); @@ -561,10 +709,12 @@ describe("E2E Tests for v1 API Routes", () => { // } // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown") }, - 30000); + 30000 + ); // TODO: fix this test (need to fix fire-engine first) - it.concurrent("should handle 'action scroll' parameter correctly", + it.concurrent( + "should handle 'action scroll' parameter correctly", async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, @@ -581,23 +731,25 @@ describe("E2E Tests for v1 API Routes", () => { } ] } as ScrapeRequest; - - const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + + const response: ScrapeResponseRequestTest = await request( + FIRECRAWL_API_URL + ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send(scrapeRequest); - + // TODO: uncomment this tests // expect(response.statusCode).toBe(200); // if (!("data" in response.body)) { // throw new Error("Expected response body to have 'data' property"); // } - // + // // expect(response.body.data.markdown).toContain("You have reached the bottom!") }, - 30000); + 30000 + ); // TODO: test scrape action - -}); \ No newline at end of file +}); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 90a4587d..e026eef0 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -3,7 +3,7 @@ import dotenv from "dotenv"; import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, - FirecrawlScrapeResponse, + FirecrawlScrapeResponse } from "../../types"; dotenv.config(); @@ -28,9 +28,8 @@ describe("E2E Tests for v0 API Routes", () => { describe("POST /v0/scrape", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlScrapeResponse = await request(TEST_URL).post( - "/v0/scrape" - ); + const response: FirecrawlScrapeResponse = + await request(TEST_URL).post("/v0/scrape"); expect(response.statusCode).toBe(401); }); @@ -99,7 +98,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true }, + pageOptions: { includeHtml: true } }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -196,7 +195,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com/", - pageOptions: { removeTags: [".nav", "#footer", "strong"] }, + pageOptions: { removeTags: [".nav", "#footer", "strong"] } }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -338,9 +337,8 @@ describe("E2E Tests for v0 API Routes", () => { describe("POST /v0/crawl", () => { it.concurrent("should require authorization", async () => { - const response: FirecrawlCrawlResponse = await request(TEST_URL).post( - "/v0/crawl" - ); + const response: FirecrawlCrawlResponse = + await request(TEST_URL).post("/v0/crawl"); expect(response.statusCode).toBe(401); }); @@ -383,8 +381,8 @@ describe("E2E Tests for v0 API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - includes: ["blog/*"], - }, + includes: ["blog/*"] + } }); let response: FirecrawlCrawlStatusResponse; @@ -446,8 +444,8 @@ describe("E2E Tests for v0 API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - excludes: ["blog/*"], - }, + excludes: ["blog/*"] + } }); let isFinished = false; @@ -494,7 +492,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 }, + crawlerOptions: { maxDepth: 1 } }); expect(crawlResponse.statusCode).toBe(200); @@ -690,7 +688,9 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Firecrawl"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( + 200 + ); expect( completedResponse.body.data[0].metadata.pageError ).toBeUndefined(); @@ -760,7 +760,10 @@ describe("E2E Tests for v0 API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } }); + .send({ + url: "https://docs.tatum.io", + crawlerOptions: { limit: 200 } + }); expect(crawlResponse.statusCode).toBe(200); @@ -825,7 +828,7 @@ describe("E2E Tests for v0 API Routes", () => { .send({ url: "https://mendable.ai", pageOptions: { - onlyMainContent: true, + onlyMainContent: true }, extractorOptions: { mode: "llm-extraction", @@ -835,18 +838,18 @@ describe("E2E Tests for v0 API Routes", () => { type: "object", properties: { company_mission: { - type: "string", + type: "string" }, supports_sso: { - type: "boolean", + type: "boolean" }, is_open_source: { - type: "boolean", - }, + type: "boolean" + } }, - required: ["company_mission", "supports_sso", "is_open_source"], - }, - }, + required: ["company_mission", "supports_sso", "is_open_source"] + } + } }); // Ensure that the job was successfully created before proceeding with LLM extraction diff --git a/apps/api/src/controllers/__tests__/crawl.test.ts b/apps/api/src/controllers/__tests__/crawl.test.ts index e65523cb..81fa2e5d 100644 --- a/apps/api/src/controllers/__tests__/crawl.test.ts +++ b/apps/api/src/controllers/__tests__/crawl.test.ts @@ -1,30 +1,30 @@ -import { crawlController } from '../v0/crawl' -import { Request, Response } from 'express'; -import { authenticateUser } from '../auth'; // Ensure this import is correct -import { createIdempotencyKey } from '../../services/idempotency/create'; -import { validateIdempotencyKey } from '../../services/idempotency/validate'; -import { v4 as uuidv4 } from 'uuid'; +import { crawlController } from "../v0/crawl"; +import { Request, Response } from "express"; +import { authenticateUser } from "../auth"; // Ensure this import is correct +import { createIdempotencyKey } from "../../services/idempotency/create"; +import { validateIdempotencyKey } from "../../services/idempotency/validate"; +import { v4 as uuidv4 } from "uuid"; -jest.mock('../auth', () => ({ +jest.mock("../auth", () => ({ authenticateUser: jest.fn().mockResolvedValue({ success: true, - team_id: 'team123', + team_id: "team123", error: null, status: 200 }), reduce: jest.fn() })); -jest.mock('../../services/idempotency/validate'); +jest.mock("../../services/idempotency/validate"); -describe('crawlController', () => { - it('should prevent duplicate requests using the same idempotency key', async () => { +describe("crawlController", () => { + it("should prevent duplicate requests using the same idempotency key", async () => { const req = { headers: { - 'x-idempotency-key': await uuidv4(), - 'Authorization': `Bearer ${process.env.TEST_API_KEY}` + "x-idempotency-key": await uuidv4(), + Authorization: `Bearer ${process.env.TEST_API_KEY}` }, body: { - url: 'https://mendable.ai' + url: "https://mendable.ai" } } as unknown as Request; const res = { @@ -33,7 +33,9 @@ describe('crawlController', () => { } as unknown as Response; // Mock the idempotency key validation to return false for the second call - (validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false); + (validateIdempotencyKey as jest.Mock) + .mockResolvedValueOnce(true) + .mockResolvedValueOnce(false); // First request should succeed await crawlController(req, res); @@ -42,6 +44,8 @@ describe('crawlController', () => { // Second request with the same key should fail await crawlController(req, res); expect(res.status).toHaveBeenCalledWith(409); - expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' }); + expect(res.json).toHaveBeenCalledWith({ + error: "Idempotency key already used" + }); }); -}); \ No newline at end of file +}); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 8f4d49ea..947c2784 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -4,7 +4,7 @@ import { AuthResponse, NotificationType, PlanType, - RateLimiterMode, + RateLimiterMode } from "../types"; import { supabase_service } from "../services/supabase"; import { withAuth } from "../lib/withAuth"; @@ -39,7 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean { export async function setCachedACUC( api_key: string, acuc: - | AuthCreditUsageChunk | null + | AuthCreditUsageChunk + | null | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null) ) { const cacheKeyACUC = `acuc_${api_key}`; @@ -48,7 +49,7 @@ export async function setCachedACUC( try { await redlock.using([redLockKey], 10000, {}, async (signal) => { if (typeof acuc === "function") { - acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null")); + acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null")); if (acuc === null) { if (signal.aborted) { @@ -125,7 +126,7 @@ export async function getACUC( if (chunk !== null && useCache) { setCachedACUC(api_key, chunk); } - + // console.log(chunk); return chunk; @@ -134,9 +135,7 @@ export async function getACUC( } } -export async function clearACUC( - api_key: string, -): Promise { +export async function clearACUC(api_key: string): Promise { const cacheKeyACUC = `acuc_${api_key}`; await deleteKey(cacheKeyACUC); } @@ -146,7 +145,11 @@ export async function authenticateUser( res, mode?: RateLimiterMode ): Promise { - return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode); + return withAuth(supaAuthenticateUser, { + success: true, + chunk: null, + team_id: "bypass" + })(req, res, mode); } export async function supaAuthenticateUser( @@ -167,7 +170,7 @@ export async function supaAuthenticateUser( return { success: false, error: "Unauthorized: Token missing", - status: 401, + status: 401 }; } @@ -196,7 +199,7 @@ export async function supaAuthenticateUser( return { success: false, error: "Unauthorized: Invalid token", - status: 401, + status: 401 }; } @@ -206,7 +209,7 @@ export async function supaAuthenticateUser( return { success: false, error: "Unauthorized: Invalid token", - status: 401, + status: 401 }; } @@ -216,7 +219,7 @@ export async function supaAuthenticateUser( const plan = getPlanByPriceId(priceId); subscriptionData = { team_id: teamId, - plan, + plan }; switch (mode) { case RateLimiterMode.Crawl: @@ -270,7 +273,13 @@ export async function supaAuthenticateUser( try { await rateLimiter.consume(team_endpoint_token); } catch (rateLimiterRes) { - logger.error(`Rate limit exceeded: ${rateLimiterRes}`, { teamId, priceId, plan: subscriptionData?.plan, mode, rateLimiterRes }); + logger.error(`Rate limit exceeded: ${rateLimiterRes}`, { + teamId, + priceId, + plan: subscriptionData?.plan, + mode, + rateLimiterRes + }); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); @@ -284,7 +293,7 @@ export async function supaAuthenticateUser( return { success: false, error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, - status: 429, + status: 429 }; } @@ -314,7 +323,7 @@ export async function supaAuthenticateUser( success: true, team_id: teamId ?? undefined, plan: (subscriptionData?.plan ?? "") as PlanType, - chunk, + chunk }; } function getPlanByPriceId(price_id: string | null): PlanType { diff --git a/apps/api/src/controllers/v0/admin/queue.ts b/apps/api/src/controllers/v0/admin/queue.ts index 6ef8a992..6cc1c6e0 100644 --- a/apps/api/src/controllers/v0/admin/queue.ts +++ b/apps/api/src/controllers/v0/admin/queue.ts @@ -31,7 +31,9 @@ export async function cleanBefore24hCompleteJobsController( ).flat(); const before24hJobs = completedJobs.filter( - (job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + (job) => + job.finishedOn !== undefined && + job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 ) || []; let count = 0; @@ -71,14 +73,14 @@ export async function queuesController(req: Request, res: Response) { const scrapeQueue = getScrapeQueue(); const [webScraperActive] = await Promise.all([ - scrapeQueue.getActiveCount(), + scrapeQueue.getActiveCount() ]); const noActiveJobs = webScraperActive === 0; // 200 if no active jobs, 503 if there are active jobs return res.status(noActiveJobs ? 200 : 500).json({ webScraperActive, - noActiveJobs, + noActiveJobs }); } catch (error) { logger.error(error); @@ -97,7 +99,7 @@ export async function autoscalerController(req: Request, res: Response) { await Promise.all([ scrapeQueue.getActiveCount(), scrapeQueue.getWaitingCount(), - scrapeQueue.getPrioritizedCount(), + scrapeQueue.getPrioritizedCount() ]); let waitingAndPriorityCount = webScraperWaiting + webScraperPriority; @@ -107,8 +109,8 @@ export async function autoscalerController(req: Request, res: Response) { "https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines", { headers: { - Authorization: `Bearer ${process.env.FLY_API_TOKEN}`, - }, + Authorization: `Bearer ${process.env.FLY_API_TOKEN}` + } } ); const machines = await request.json(); @@ -184,13 +186,13 @@ export async function autoscalerController(req: Request, res: Response) { } return res.status(200).json({ mode: "scale-descale", - count: targetMachineCount, + count: targetMachineCount }); } return res.status(200).json({ mode: "normal", - count: activeMachines, + count: activeMachines }); } catch (error) { logger.error(error); diff --git a/apps/api/src/controllers/v0/admin/redis-health.ts b/apps/api/src/controllers/v0/admin/redis-health.ts index dc587606..963755ef 100644 --- a/apps/api/src/controllers/v0/admin/redis-health.ts +++ b/apps/api/src/controllers/v0/admin/redis-health.ts @@ -49,7 +49,7 @@ export async function redisHealthController(req: Request, res: Response) { const healthStatus = { queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", redisRateLimitClient: - redisRateLimitHealth === testValue ? "healthy" : "unhealthy", + redisRateLimitHealth === testValue ? "healthy" : "unhealthy" }; if ( diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts index e81064f2..b445978c 100644 --- a/apps/api/src/controllers/v0/crawl-cancel.ts +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -10,13 +10,9 @@ configDotenv(); export async function crawlCancelController(req: Request, res: Response) { try { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; - const auth = await authenticateUser( - req, - res, - RateLimiterMode.CrawlStatus - ); + const auth = await authenticateUser(req, res, RateLimiterMode.CrawlStatus); if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 9c799eeb..756fca44 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -12,21 +12,25 @@ import { toLegacyDocument } from "../v1/types"; configDotenv(); export async function getJobs(crawlId: string, ids: string[]) { - const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[]; - + const jobs = ( + await Promise.all(ids.map((x) => getScrapeQueue().getJob(x))) + ).filter((x) => x) as Job[]; + if (process.env.USE_DB_AUTHENTICATION === "true") { const supabaseData = await supabaseGetJobsByCrawlId(crawlId); - supabaseData.forEach(x => { - const job = jobs.find(y => y.id === x.job_id); + supabaseData.forEach((x) => { + const job = jobs.find((y) => y.id === x.job_id); if (job) { job.returnvalue = x.docs; } - }) + }); } - jobs.forEach(job => { - job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + jobs.forEach((job) => { + job.returnvalue = Array.isArray(job.returnvalue) + ? job.returnvalue[0] + : job.returnvalue; }); return jobs; @@ -34,11 +38,7 @@ export async function getJobs(crawlId: string, ids: string[]) { export async function crawlStatusController(req: Request, res: Response) { try { - const auth = await authenticateUser( - req, - res, - RateLimiterMode.CrawlStatus - ); + const auth = await authenticateUser(req, res, RateLimiterMode.CrawlStatus); if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } @@ -55,7 +55,7 @@ export async function crawlStatusController(req: Request, res: Response) { } let jobIDs = await getCrawlJobs(req.params.jobId); let jobs = await getJobs(req.params.jobId, jobIDs); - let jobStatuses = await Promise.all(jobs.map(x => x.getState())); + let jobStatuses = await Promise.all(jobs.map((x) => x.getState())); // Combine jobs and jobStatuses into a single array of objects let jobsWithStatuses = jobs.map((job, index) => ({ @@ -64,18 +64,31 @@ export async function crawlStatusController(req: Request, res: Response) { })); // Filter out failed jobs - jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed" && x.status !== "unknown"); + jobsWithStatuses = jobsWithStatuses.filter( + (x) => x.status !== "failed" && x.status !== "unknown" + ); // Sort jobs by timestamp jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp); // Extract sorted jobs and statuses - jobs = jobsWithStatuses.map(x => x.job); - jobStatuses = jobsWithStatuses.map(x => x.status); + jobs = jobsWithStatuses.map((x) => x.job); + jobStatuses = jobsWithStatuses.map((x) => x.status); - const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active"; + const jobStatus = sc.cancelled + ? "failed" + : jobStatuses.every((x) => x === "completed") + ? "completed" + : "active"; - const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); + const data = jobs + .filter( + (x) => + x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null + ) + .map((x) => + Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue + ); if ( jobs.length > 0 && @@ -83,7 +96,7 @@ export async function crawlStatusController(req: Request, res: Response) { jobs[0].data.pageOptions && !jobs[0].data.pageOptions.includeRawHtml ) { - data.forEach(item => { + data.forEach((item) => { if (item) { delete item.rawHtml; } @@ -92,10 +105,19 @@ export async function crawlStatusController(req: Request, res: Response) { res.json({ status: jobStatus, - current: jobStatuses.filter(x => x === "completed" || x === "failed").length, + current: jobStatuses.filter((x) => x === "completed" || x === "failed") + .length, total: jobs.length, - data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null, - partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)), + data: + jobStatus === "completed" + ? data.map((x) => toLegacyDocument(x, sc.internalOptions)) + : null, + partial_data: + jobStatus === "completed" + ? [] + : data + .filter((x) => x !== null) + .map((x) => toLegacyDocument(x, sc.internalOptions)) }); } catch (error) { Sentry.captureException(error); diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index 06a86f92..b8c6bc63 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { logCrawl } from "../../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../../src/services/idempotency/create"; -import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values"; +import { + defaultCrawlPageOptions, + defaultCrawlerOptions, + defaultOrigin +} from "../../../src/lib/default-values"; import { v4 as uuidv4 } from "uuid"; import { logger } from "../../../src/lib/logger"; -import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; +import { + addCrawlJob, + addCrawlJobs, + crawlToCrawler, + lockURL, + lockURLs, + saveCrawl, + StoredCrawl +} from "../../../src/lib/crawl-redis"; import { getScrapeQueue } from "../../../src/services/queue-service"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; @@ -20,11 +32,7 @@ import { ZodError } from "zod"; export async function crawlController(req: Request, res: Response) { try { - const auth = await authenticateUser( - req, - res, - RateLimiterMode.Crawl - ); + const auth = await authenticateUser(req, res, RateLimiterMode.Crawl); if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } @@ -46,7 +54,7 @@ export async function crawlController(req: Request, res: Response) { const crawlerOptions = { ...defaultCrawlerOptions, - ...req.body.crawlerOptions, + ...req.body.crawlerOptions }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; @@ -71,16 +79,24 @@ export async function crawlController(req: Request, res: Response) { } const limitCheck = req.body?.crawlerOptions?.limit ?? 1; - const { success: creditsCheckSuccess, message: creditsCheckMessage, remainingCredits } = - await checkTeamCredits(chunk, team_id, limitCheck); + const { + success: creditsCheckSuccess, + message: creditsCheckMessage, + remainingCredits + } = await checkTeamCredits(chunk, team_id, limitCheck); if (!creditsCheckSuccess) { - return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" }); + return res + .status(402) + .json({ + error: + "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" + }); } // TODO: need to do this to v1 crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); - + let url = urlSchema.parse(req.body.url); if (!url) { return res.status(400).json({ error: "Url is required" }); @@ -99,7 +115,7 @@ export async function crawlController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); } @@ -136,7 +152,11 @@ export async function crawlController(req: Request, res: Response) { await logCrawl(id, team_id); - const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); + const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions( + pageOptions, + undefined, + undefined + ); internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter delete (scrapeOptions as any).timeout; @@ -148,7 +168,7 @@ export async function crawlController(req: Request, res: Response) { internalOptions, team_id, plan, - createdAt: Date.now(), + createdAt: Date.now() }; const crawler = crawlToCrawler(id, sc); @@ -163,14 +183,13 @@ export async function crawlController(req: Request, res: Response) { ? null : await crawler.tryGetSitemap(); - if (sitemap !== null && sitemap.length > 0) { let jobPriority = 20; // If it is over 1000, we need to get the job priority, // otherwise we can use the default priority of 20 - if(sitemap.length > 1000){ + if (sitemap.length > 1000) { // set base to 21 - jobPriority = await getJobPriority({plan, team_id, basePriority: 21}) + jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 }); } const jobs = sitemap.map((x) => { const url = x.url; @@ -187,12 +206,12 @@ export async function crawlController(req: Request, res: Response) { plan, origin: req.body.origin ?? defaultOrigin, crawl_id: id, - sitemapped: true, + sitemapped: true }, opts: { jobId: uuid, - priority: jobPriority, - }, + priority: jobPriority + } }; }); @@ -226,12 +245,12 @@ export async function crawlController(req: Request, res: Response) { team_id, plan: plan!, origin: req.body.origin ?? defaultOrigin, - crawl_id: id, + crawl_id: id }, { - priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs + priority: 15 // prioritize request 0 of crawl jobs same as scrape jobs }, - jobId, + jobId ); await addCrawlJob(id, jobId); } @@ -240,8 +259,10 @@ export async function crawlController(req: Request, res: Response) { } catch (error) { Sentry.captureException(error); logger.error(error); - return res.status(500).json({ error: error instanceof ZodError - ? "Invalid URL" - : error.message }); + return res + .status(500) + .json({ + error: error instanceof ZodError ? "Invalid URL" : error.message + }); } } diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 8b82bef8..3b47bfaa 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -4,7 +4,13 @@ import { RateLimiterMode } from "../../../src/types"; import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; import { logger } from "../../../src/lib/logger"; -import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; +import { + addCrawlJob, + crawlToCrawler, + lockURL, + saveCrawl, + StoredCrawl +} from "../../../src/lib/crawl-redis"; import { addScrapeJob } from "../../../src/services/queue-jobs"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; @@ -12,11 +18,7 @@ import { fromLegacyScrapeOptions } from "../v1/types"; export async function crawlPreviewController(req: Request, res: Response) { try { - const auth = await authenticateUser( - req, - res, - RateLimiterMode.Preview - ); + const auth = await authenticateUser(req, res, RateLimiterMode.Preview); const team_id = "preview"; @@ -39,16 +41,18 @@ export async function crawlPreviewController(req: Request, res: Response) { } if (isUrlBlocked(url)) { - return res - .status(403) - .json({ - error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - }); + return res.status(403).json({ + error: + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + }); } const crawlerOptions = req.body.crawlerOptions ?? {}; - const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] }; + const pageOptions = req.body.pageOptions ?? { + onlyMainContent: false, + includeHtml: false, + removeTags: [] + }; // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? // try { @@ -87,7 +91,11 @@ export async function crawlPreviewController(req: Request, res: Response) { robots = await this.getRobotsTxt(); } catch (_) {} - const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); + const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions( + pageOptions, + undefined, + undefined + ); const sc: StoredCrawl = { originUrl: url, @@ -97,20 +105,44 @@ export async function crawlPreviewController(req: Request, res: Response) { team_id, plan, robots, - createdAt: Date.now(), + createdAt: Date.now() }; await saveCrawl(id, sc); const crawler = crawlToCrawler(id, sc); - const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap(); + const sitemap = sc.crawlerOptions?.ignoreSitemap + ? null + : await crawler.tryGetSitemap(); if (sitemap !== null) { - for (const url of sitemap.map(x => x.url)) { + for (const url of sitemap.map((x) => x.url)) { await lockURL(id, sc, url); const jobId = uuidv4(); - await addScrapeJob({ + await addScrapeJob( + { + url, + mode: "single_urls", + team_id, + plan: plan!, + crawlerOptions, + scrapeOptions, + internalOptions, + origin: "website-preview", + crawl_id: id, + sitemapped: true + }, + {}, + jobId + ); + await addCrawlJob(id, jobId); + } + } else { + await lockURL(id, sc, url); + const jobId = uuidv4(); + await addScrapeJob( + { url, mode: "single_urls", team_id, @@ -119,25 +151,11 @@ export async function crawlPreviewController(req: Request, res: Response) { scrapeOptions, internalOptions, origin: "website-preview", - crawl_id: id, - sitemapped: true, - }, {}, jobId); - await addCrawlJob(id, jobId); - } - } else { - await lockURL(id, sc, url); - const jobId = uuidv4(); - await addScrapeJob({ - url, - mode: "single_urls", - team_id, - plan: plan!, - crawlerOptions, - scrapeOptions, - internalOptions, - origin: "website-preview", - crawl_id: id, - }, {}, jobId); + crawl_id: id + }, + {}, + jobId + ); await addCrawlJob(id, jobId); } diff --git a/apps/api/src/controllers/v0/keyAuth.ts b/apps/api/src/controllers/v0/keyAuth.ts index 63915302..2495705c 100644 --- a/apps/api/src/controllers/v0/keyAuth.ts +++ b/apps/api/src/controllers/v0/keyAuth.ts @@ -1,17 +1,12 @@ - import { AuthResponse, RateLimiterMode } from "../../types"; import { Request, Response } from "express"; import { authenticateUser } from "../auth"; - export const keyAuthController = async (req: Request, res: Response) => { try { // make sure to authenticate user first, Bearer - const auth = await authenticateUser( - req, - res - ); + const auth = await authenticateUser(req, res); if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } @@ -22,4 +17,3 @@ export const keyAuthController = async (req: Request, res: Response) => { return res.status(500).json({ error: error.message }); } }; - diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 02b9400e..c7c8d9fe 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -2,19 +2,24 @@ import { ExtractorOptions, PageOptions } from "./../../lib/entities"; import { Request, Response } from "express"; import { billTeam, - checkTeamCredits, + checkTeamCredits } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; -import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types"; +import { + Document, + fromLegacyCombo, + toLegacyDocument, + url as urlSchema +} from "../v1/types"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, - defaultOrigin, + defaultOrigin } from "../../lib/default-values"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { getScrapeQueue } from "../../services/queue-service"; @@ -50,13 +55,18 @@ export async function scrapeHelper( success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - returnCode: 403, + returnCode: 403 }; } const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); - const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions); + const { scrapeOptions, internalOptions } = fromLegacyCombo( + pageOptions, + extractorOptions, + timeout, + crawlerOptions + ); await addScrapeJob( { @@ -67,7 +77,7 @@ export async function scrapeHelper( internalOptions, plan: plan!, origin: req.body.origin ?? defaultOrigin, - is_scrape: true, + is_scrape: true }, {}, jobId, @@ -80,18 +90,21 @@ export async function scrapeHelper( { name: "Wait for job to finish", op: "bullmq.wait", - attributes: { job: jobId }, + attributes: { job: jobId } }, async (span) => { try { - doc = (await waitForJob(jobId, timeout)); + doc = await waitForJob(jobId, timeout); } catch (e) { - if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) { + if ( + e instanceof Error && + (e.message.startsWith("Job wait") || e.message === "timeout") + ) { span.setAttribute("timedOut", true); return { success: false, error: "Request timed out", - returnCode: 408, + returnCode: 408 }; } else if ( typeof e === "string" && @@ -104,7 +117,7 @@ export async function scrapeHelper( return { success: false, error: e, - returnCode: 500, + returnCode: 500 }; } else { throw e; @@ -127,7 +140,7 @@ export async function scrapeHelper( success: true, error: "No page found", returnCode: 200, - data: doc, + data: doc }; } @@ -153,7 +166,7 @@ export async function scrapeHelper( return { success: true, data: toLegacyDocument(doc, internalOptions), - returnCode: 200, + returnCode: 200 }; } @@ -161,11 +174,7 @@ export async function scrapeController(req: Request, res: Response) { try { let earlyReturn = false; // make sure to authenticate user first, Bearer - const auth = await authenticateUser( - req, - res, - RateLimiterMode.Scrape - ); + const auth = await authenticateUser(req, res, RateLimiterMode.Scrape); if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } @@ -176,7 +185,7 @@ export async function scrapeController(req: Request, res: Response) { const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; const extractorOptions = { ...defaultExtractorOptions, - ...req.body.extractorOptions, + ...req.body.extractorOptions }; const origin = req.body.origin ?? defaultOrigin; let timeout = req.body.timeout ?? defaultTimeout; @@ -188,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) { ) { return res.status(400).json({ error: - "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified", + "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" }); } @@ -202,14 +211,19 @@ export async function scrapeController(req: Request, res: Response) { await checkTeamCredits(chunk, team_id, 1); if (!creditsCheckSuccess) { earlyReturn = true; - return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" }); + return res + .status(402) + .json({ + error: + "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" + }); } } catch (error) { logger.error(error); earlyReturn = true; return res.status(500).json({ error: - "Error checking team credits. Please contact help@firecrawl.com for help.", + "Error checking team credits. Please contact help@firecrawl.com for help." }); } @@ -230,7 +244,10 @@ export async function scrapeController(req: Request, res: Response) { const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = result.data && (result.data as Document).markdown - ? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo") + ? numTokensFromString( + (result.data as Document).markdown!, + "gpt-3.5-turbo" + ) : 0; if (result.success) { @@ -250,27 +267,33 @@ export async function scrapeController(req: Request, res: Response) { } if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction - billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => { - logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); + billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}` + ); // Optionally, you could notify an admin or add to a retry queue here }); } } - + let doc = result.data; if (!pageOptions || !pageOptions.includeRawHtml) { if (doc && (doc as Document).rawHtml) { delete (doc as Document).rawHtml; } } - - if(pageOptions && pageOptions.includeExtract) { - if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { + + if (pageOptions && pageOptions.includeExtract) { + if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { delete (doc as Document).markdown; } } - const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout); + const { scrapeOptions } = fromLegacyScrapeOptions( + pageOptions, + extractorOptions, + timeout + ); logJob({ job_id: jobId, @@ -285,7 +308,7 @@ export async function scrapeController(req: Request, res: Response) { crawlerOptions: crawlerOptions, scrapeOptions, origin: origin, - num_tokens: numTokens, + num_tokens: numTokens }); return res.status(result.returnCode).json(result); @@ -298,7 +321,7 @@ export async function scrapeController(req: Request, res: Response) { ? "Invalid URL" : typeof error === "string" ? error - : error?.message ?? "Internal Server Error", + : (error?.message ?? "Internal Server Error") }); } } diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 4dd38afd..4950ea5f 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -1,5 +1,8 @@ import { Request, Response } from "express"; -import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; +import { + billTeam, + checkTeamCredits +} from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; @@ -13,7 +16,12 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { Job } from "bullmq"; -import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types"; +import { + Document, + fromLegacyCombo, + fromLegacyScrapeOptions, + toLegacyDocument +} from "../v1/types"; export async function searchHelper( jobId: string, @@ -54,16 +62,23 @@ export async function searchHelper( filter: filter, lang: searchOptions.lang ?? "en", country: searchOptions.country ?? "us", - location: searchOptions.location, + location: searchOptions.location }); let justSearch = pageOptions.fetchPageContent === false; - const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions); + const { scrapeOptions, internalOptions } = fromLegacyCombo( + pageOptions, + undefined, + 60000, + crawlerOptions + ); if (justSearch) { - billTeam(team_id, subscription_id, res.length).catch(error => { - logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); + billTeam(team_id, subscription_id, res.length).catch((error) => { + logger.error( + `Failed to bill team ${team_id} for ${res.length} credits: ${error}` + ); // Optionally, you could notify an admin or add to a retry queue here }); return { success: true, data: res, returnCode: 200 }; @@ -78,11 +93,11 @@ export async function searchHelper( return { success: true, error: "No search results found", returnCode: 200 }; } - const jobPriority = await getJobPriority({plan, team_id, basePriority: 20}); - + const jobPriority = await getJobPriority({ plan, team_id, basePriority: 20 }); + // filter out social media links - const jobDatas = res.map(x => { + const jobDatas = res.map((x) => { const url = x.url; const uuid = uuidv4(); return { @@ -92,28 +107,32 @@ export async function searchHelper( mode: "single_urls", team_id: team_id, scrapeOptions, - internalOptions, + internalOptions }, opts: { jobId: uuid, - priority: jobPriority, + priority: jobPriority } }; - }) + }); // TODO: addScrapeJobs for (const job of jobDatas) { - await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority) + await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority); } - const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions)); - + const docs = ( + await Promise.all( + jobDatas.map((x) => waitForJob(x.opts.jobId, 60000)) + ) + ).map((x) => toLegacyDocument(x, internalOptions)); + if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; } const sq = getScrapeQueue(); - await Promise.all(jobDatas.map(x => sq.remove(x.opts.jobId))); + await Promise.all(jobDatas.map((x) => sq.remove(x.opts.jobId))); // make sure doc.content is not empty const filteredDocs = docs.filter( @@ -121,24 +140,25 @@ export async function searchHelper( ); if (filteredDocs.length === 0) { - return { success: true, error: "No page found", returnCode: 200, data: docs }; + return { + success: true, + error: "No page found", + returnCode: 200, + data: docs + }; } return { success: true, data: filteredDocs, - returnCode: 200, + returnCode: 200 }; } export async function searchController(req: Request, res: Response) { try { // make sure to authenticate user first, Bearer - const auth = await authenticateUser( - req, - res, - RateLimiterMode.Search - ); + const auth = await authenticateUser(req, res, RateLimiterMode.Search); if (!auth.success) { return res.status(auth.status).json({ error: auth.error }); } @@ -149,12 +169,12 @@ export async function searchController(req: Request, res: Response) { onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false, fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true, removeTags: req.body.pageOptions?.removeTags ?? [], - fallback: req.body.pageOptions?.fallback ?? false, + fallback: req.body.pageOptions?.fallback ?? false }; const origin = req.body.origin ?? "api"; const searchOptions = req.body.searchOptions ?? { limit: 5 }; - + const jobId = uuidv4(); try { @@ -192,11 +212,14 @@ export async function searchController(req: Request, res: Response) { mode: "search", url: req.body.query, crawlerOptions: crawlerOptions, - origin: origin, + origin: origin }); return res.status(result.returnCode).json(result); } catch (error) { - if (error instanceof Error && (error.message.startsWith("Job wait") || error.message === "timeout")) { + if ( + error instanceof Error && + (error.message.startsWith("Job wait") || error.message === "timeout") + ) { return res.status(408).json({ error: "Request timed out" }); } diff --git a/apps/api/src/controllers/v0/status.ts b/apps/api/src/controllers/v0/status.ts index c5eafc2d..73bfa159 100644 --- a/apps/api/src/controllers/v0/status.ts +++ b/apps/api/src/controllers/v0/status.ts @@ -4,7 +4,10 @@ import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; -export async function crawlJobStatusPreviewController(req: Request, res: Response) { +export async function crawlJobStatusPreviewController( + req: Request, + res: Response +) { try { const sc = await getCrawl(req.params.jobId); if (!sc) { @@ -22,18 +25,30 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons // } // } - const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); - const jobStatuses = await Promise.all(jobs.map(x => x.getState())); - const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; + const jobs = (await getJobs(req.params.jobId, jobIDs)).sort( + (a, b) => a.timestamp - b.timestamp + ); + const jobStatuses = await Promise.all(jobs.map((x) => x.getState())); + const jobStatus = sc.cancelled + ? "failed" + : jobStatuses.every((x) => x === "completed") + ? "completed" + : jobStatuses.some((x) => x === "failed") + ? "failed" + : "active"; - const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); + const data = jobs.map((x) => + Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue + ); res.json({ status: jobStatus, - current: jobStatuses.filter(x => x === "completed" || x === "failed").length, + current: jobStatuses.filter((x) => x === "completed" || x === "failed") + .length, total: jobs.length, data: jobStatus === "completed" ? data : null, - partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null), + partial_data: + jobStatus === "completed" ? [] : data.filter((x) => x !== null) }); } catch (error) { Sentry.captureException(error); diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts index 0a9931d3..1ce058a0 100644 --- a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts +++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -24,11 +24,15 @@ describe("URL Schema Validation", () => { }); it("should reject URLs without a valid top-level domain", () => { - expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path"); + expect(() => url.parse("http://example")).toThrow( + "URL must have a valid top-level domain or be a valid path" + ); }); it("should reject blocked URLs", () => { - expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(() => url.parse("https://facebook.com")).toThrow( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); }); it("should handle URLs with subdomains correctly", () => { @@ -42,23 +46,33 @@ describe("URL Schema Validation", () => { }); it("should handle URLs with subdomains that are blocked", () => { - expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(() => url.parse("https://sub.facebook.com")).toThrow( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); }); it("should handle URLs with paths that are blocked", () => { - expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); - expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + expect(() => url.parse("http://facebook.com/path")).toThrow( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); + expect(() => url.parse("https://facebook.com/another/path")).toThrow( + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + ); }); - + it("should reject malformed URLs starting with 'http://http'", () => { - expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol."); + expect(() => url.parse("http://http://example.com")).toThrow( + "Invalid URL. Invalid protocol." + ); }); it("should reject malformed URLs containing multiple 'http://'", () => { - expect(() => url.parse("http://example.com/http://example.com")).not.toThrow(); + expect(() => + url.parse("http://example.com/http://example.com") + ).not.toThrow(); }); it("should reject malformed URLs containing multiple 'http://'", () => { expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL"); }); -}) \ No newline at end of file +}); diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 064ee73b..a78264e3 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -5,14 +5,14 @@ import { batchScrapeRequestSchema, CrawlResponse, RequestWithAuth, - ScrapeOptions, + ScrapeOptions } from "./types"; import { addCrawlJobs, getCrawl, lockURLs, saveCrawl, - StoredCrawl, + StoredCrawl } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getJobPriority } from "../../lib/job-priority"; @@ -27,27 +27,40 @@ export async function batchScrapeController( req.body = batchScrapeRequestSchema.parse(req.body); const id = req.body.appendToId ?? uuidv4(); - const logger = _logger.child({ crawlId: id, batchScrapeId: id, module: "api/v1", method: "batchScrapeController", teamId: req.auth.team_id, plan: req.auth.plan }); - logger.debug("Batch scrape " + id + " starting", { urlsLength: req.body.urls, appendToId: req.body.appendToId, account: req.account }); + const logger = _logger.child({ + crawlId: id, + batchScrapeId: id, + module: "api/v1", + method: "batchScrapeController", + teamId: req.auth.team_id, + plan: req.auth.plan + }); + logger.debug("Batch scrape " + id + " starting", { + urlsLength: req.body.urls, + appendToId: req.body.appendToId, + account: req.account + }); if (!req.body.appendToId) { await logCrawl(id, req.auth.team_id); } let { remainingCredits } = req.account!; - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - if(!useDbAuthentication){ + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; + if (!useDbAuthentication) { remainingCredits = Infinity; } - const sc: StoredCrawl = req.body.appendToId ? await getCrawl(req.body.appendToId) as StoredCrawl : { - crawlerOptions: null, - scrapeOptions: req.body, - internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter - team_id: req.auth.team_id, - createdAt: Date.now(), - plan: req.auth.plan, - }; + const sc: StoredCrawl = req.body.appendToId + ? ((await getCrawl(req.body.appendToId)) as StoredCrawl) + : { + crawlerOptions: null, + scrapeOptions: req.body, + internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter + team_id: req.auth.team_id, + createdAt: Date.now(), + plan: req.auth.plan + }; if (!req.body.appendToId) { await saveCrawl(id, sc); @@ -57,9 +70,13 @@ export async function batchScrapeController( // If it is over 1000, we need to get the job priority, // otherwise we can use the default priority of 20 - if(req.body.urls.length > 1000){ + if (req.body.urls.length > 1000) { // set base to 21 - jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + jobPriority = await getJobPriority({ + plan: req.auth.plan, + team_id: req.auth.team_id, + basePriority: 21 + }); } logger.debug("Using job priority " + jobPriority, { jobPriority }); @@ -80,12 +97,12 @@ export async function batchScrapeController( crawl_id: id, sitemapped: true, v1: true, - webhook: req.body.webhook, + webhook: req.body.webhook }, opts: { jobId: uuidv4(), - priority: 20, - }, + priority: 20 + } }; }); @@ -103,18 +120,25 @@ export async function batchScrapeController( logger.debug("Adding scrape jobs to BullMQ..."); await addScrapeJobs(jobs); - if(req.body.webhook) { - logger.debug("Calling webhook with batch_scrape.started...", { webhook: req.body.webhook }); - await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started"); + if (req.body.webhook) { + logger.debug("Calling webhook with batch_scrape.started...", { + webhook: req.body.webhook + }); + await callWebhook( + req.auth.team_id, + id, + null, + req.body.webhook, + true, + "batch_scrape.started" + ); } const protocol = process.env.ENV === "local" ? req.protocol : "https"; - + return res.status(200).json({ success: true, id, - url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, + url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}` }); } - - diff --git a/apps/api/src/controllers/v1/concurrency-check.ts b/apps/api/src/controllers/v1/concurrency-check.ts index 8695c6e6..bd25c73b 100644 --- a/apps/api/src/controllers/v1/concurrency-check.ts +++ b/apps/api/src/controllers/v1/concurrency-check.ts @@ -2,7 +2,7 @@ import { authenticateUser } from "../auth"; import { ConcurrencyCheckParams, ConcurrencyCheckResponse, - RequestWithAuth, + RequestWithAuth } from "./types"; import { RateLimiterMode } from "../../types"; import { Response } from "express"; diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts index 958318b5..986ff104 100644 --- a/apps/api/src/controllers/v1/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -7,9 +7,12 @@ import { configDotenv } from "dotenv"; import { RequestWithAuth } from "./types"; configDotenv(); -export async function crawlCancelController(req: RequestWithAuth<{ jobId: string }>, res: Response) { +export async function crawlCancelController( + req: RequestWithAuth<{ jobId: string }>, + res: Response +) { try { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; const sc = await getCrawl(req.params.jobId); if (!sc) { diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index f552492f..d9994d97 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -1,32 +1,47 @@ import { authMiddleware } from "../../routes/v1"; import { RateLimiterMode } from "../../types"; import { authenticateUser } from "../auth"; -import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types"; +import { + CrawlStatusParams, + CrawlStatusResponse, + Document, + ErrorResponse, + RequestWithAuth +} from "./types"; import { WebSocket } from "ws"; import { v4 as uuidv4 } from "uuid"; import { logger } from "../../lib/logger"; -import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis"; +import { + getCrawl, + getCrawlExpiry, + getCrawlJobs, + getDoneJobsOrdered, + getDoneJobsOrderedLength, + getThrottledJobs, + isCrawlFinished, + isCrawlFinishedLocked +} from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { getJob, getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; import { Job, JobState } from "bullmq"; type ErrorMessage = { - type: "error", - error: string, -} + type: "error"; + error: string; +}; type CatchupMessage = { - type: "catchup", - data: CrawlStatusResponse, -} + type: "catchup"; + data: CrawlStatusResponse; +}; type DocumentMessage = { - type: "document", - data: Document, -} + type: "document"; + data: Document; +}; -type DoneMessage = { type: "done" } +type DoneMessage = { type: "done" }; type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage; @@ -47,7 +62,10 @@ function close(ws: WebSocket, code: number, msg: Message) { } } -async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth) { +async function crawlStatusWS( + ws: WebSocket, + req: RequestWithAuth +) { const sc = await getCrawl(req.params.jobId); if (!sc) { return close(ws, 1008, { type: "error", error: "Job not found" }); @@ -69,17 +87,23 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth !doneJobIDs.includes(x)); - const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)])); - const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]); - const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[] + const notDoneJobIDs = jobIDs.filter((x) => !doneJobIDs.includes(x)); + const jobStatuses = await Promise.all( + notDoneJobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)]) + ); + const newlyDoneJobIDs: string[] = jobStatuses + .filter((x) => x[1] === "completed" || x[1] === "failed") + .map((x) => x[0]); + const newlyDoneJobs: Job[] = ( + await Promise.all(newlyDoneJobIDs.map((x) => getJob(x))) + ).filter((x) => x !== undefined) as Job[]; for (const job of newlyDoneJobs) { if (job.returnvalue) { send(ws, { type: "document", - data: job.returnvalue, - }) + data: job.returnvalue + }); } else { return close(ws, 3000, { type: "error", error: job.failedReason }); } @@ -95,8 +119,10 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth [x, await getScrapeQueue().getJobState(x)] as const)); - const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id)); + let jobStatuses = await Promise.all( + jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const) + ); + const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id))); const throttledJobsSet = new Set(throttledJobs); @@ -104,18 +130,27 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; + const status: Exclude["status"] = + sc.cancelled + ? "cancelled" + : validJobStatuses.every((x) => x[1] === "completed") + ? "completed" + : "scraping"; jobIDs = validJobIDs; // Use validJobIDs instead of jobIDs for further processing const doneJobs = await getJobs(doneJobIDs); - const data = doneJobs.map(x => x.returnvalue); + const data = doneJobs.map((x) => x.returnvalue); send(ws, { type: "catchup", @@ -126,7 +161,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth) { +export async function crawlStatusWSController( + ws: WebSocket, + req: RequestWithAuth +) { try { - const auth = await authenticateUser( - req, - null, - RateLimiterMode.CrawlStatus, - ); + const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus); if (!auth.success) { return close(ws, 3000, { type: "error", - error: auth.error, + error: auth.error }); } @@ -167,15 +201,24 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut verbose = JSON.stringify({ message: err.message, name: err.name, - stack: err.stack, + stack: err.stack }); } } - logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); + logger.error( + "Error occurred in WebSocket! (" + + req.path + + ") -- ID " + + id + + " -- " + + verbose + ); return close(ws, 1011, { type: "error", - error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id + error: + "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + + id }); } } diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index c0c4f4b5..d88d26fb 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -1,8 +1,23 @@ import { Response } from "express"; -import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types"; -import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis"; +import { + CrawlStatusParams, + CrawlStatusResponse, + ErrorResponse, + RequestWithAuth +} from "./types"; +import { + getCrawl, + getCrawlExpiry, + getCrawlJobs, + getDoneJobsOrdered, + getDoneJobsOrderedLength, + getThrottledJobs +} from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; -import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; +import { + supabaseGetJobById, + supabaseGetJobsById +} from "../../lib/supabase-jobs"; import { configDotenv } from "dotenv"; import { Job, JobState } from "bullmq"; import { logger } from "../../lib/logger"; @@ -11,7 +26,7 @@ configDotenv(); export async function getJob(id: string) { const job = await getScrapeQueue().getJob(id); if (!job) return job; - + if (process.env.USE_DB_AUTHENTICATION === "true") { const supabaseData = await supabaseGetJobById(id); @@ -20,33 +35,43 @@ export async function getJob(id: string) { } } - job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + job.returnvalue = Array.isArray(job.returnvalue) + ? job.returnvalue[0] + : job.returnvalue; return job; } export async function getJobs(ids: string[]) { - const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[]; - + const jobs: (Job & { id: string })[] = ( + await Promise.all(ids.map((x) => getScrapeQueue().getJob(x))) + ).filter((x) => x) as (Job & { id: string })[]; + if (process.env.USE_DB_AUTHENTICATION === "true") { const supabaseData = await supabaseGetJobsById(ids); - supabaseData.forEach(x => { - const job = jobs.find(y => y.id === x.job_id); + supabaseData.forEach((x) => { + const job = jobs.find((y) => y.id === x.job_id); if (job) { job.returnvalue = x.docs; } - }) + }); } - jobs.forEach(job => { - job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; + jobs.forEach((job) => { + job.returnvalue = Array.isArray(job.returnvalue) + ? job.returnvalue[0] + : job.returnvalue; }); return jobs; } -export async function crawlStatusController(req: RequestWithAuth, res: Response, isBatch = false) { +export async function crawlStatusController( + req: RequestWithAuth, + res: Response, + isBatch = false +) { const sc = await getCrawl(req.params.jobId); if (!sc) { return res.status(404).json({ success: false, error: "Job not found" }); @@ -56,12 +81,18 @@ export async function crawlStatusController(req: RequestWithAuth [x, await getScrapeQueue().getJobState(x)] as const)); - const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id)); + let jobStatuses = await Promise.all( + jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const) + ); + const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id))); const throttledJobsSet = new Set(throttledJobs); @@ -69,30 +100,48 @@ export async function crawlStatusController(req: RequestWithAuth["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping"; + const status: Exclude["status"] = + sc.cancelled + ? "cancelled" + : validJobStatuses.every((x) => x[1] === "completed") + ? "completed" + : "scraping"; // Use validJobIDs instead of jobIDs for further processing jobIDs = validJobIDs; const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); - const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1); + const doneJobsOrder = await getDoneJobsOrdered( + req.params.jobId, + start, + end ?? -1 + ); let doneJobs: Job[] = []; - if (end === undefined) { // determine 10 megabyte limit + if (end === undefined) { + // determine 10 megabyte limit let bytes = 0; const bytesLimit = 10485760; // 10 MiB in bytes const factor = 100; // chunking for faster retrieval - for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) { + for ( + let i = 0; + i < doneJobsOrder.length && bytes < bytesLimit; + i += factor + ) { // get current chunk and retrieve jobs - const currentIDs = doneJobsOrder.slice(i, i+factor); + const currentIDs = doneJobsOrder.slice(i, i + factor); const jobs = await getJobs(currentIDs); // iterate through jobs and add them one them one to the byte counter @@ -101,12 +150,16 @@ export async function crawlStatusController(req: RequestWithAuth (await x.getState()) === "failed" ? null : x))).filter(x => x !== null) as Job[]; + doneJobs = ( + await Promise.all( + (await getJobs(doneJobsOrder)).map(async (x) => + (await x.getState()) === "failed" ? null : x + ) + ) + ).filter((x) => x !== null) as Job[]; } - const data = doneJobs.map(x => x.returnvalue); + const data = doneJobs.map((x) => x.returnvalue); const protocol = process.env.ENV === "local" ? req.protocol : "https"; - const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`); + const nextURL = new URL( + `${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}` + ); nextURL.searchParams.set("skip", (start + data.length).toString()); @@ -151,10 +212,9 @@ export async function crawlStatusController(req: RequestWithAuth 0) { - logger.debug("Using sitemap of length " + sitemap.length, { sitemapLength: sitemap.length }); + logger.debug("Using sitemap of length " + sitemap.length, { + sitemapLength: sitemap.length + }); let jobPriority = 20; // If it is over 1000, we need to get the job priority, // otherwise we can use the default priority of 20 - if(sitemap.length > 1000){ + if (sitemap.length > 1000) { // set base to 21 - jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21}) + jobPriority = await getJobPriority({ + plan: req.auth.plan, + team_id: req.auth.team_id, + basePriority: 21 + }); } logger.debug("Using job priority " + jobPriority, { jobPriority }); @@ -127,14 +149,14 @@ export async function crawlController( crawl_id: id, sitemapped: true, webhook: req.body.webhook, - v1: true, + v1: true }, opts: { jobId: uuid, - priority: 20, - }, + priority: 20 + } }; - }) + }); logger.debug("Locking URLs..."); await lockURLs( @@ -150,7 +172,9 @@ export async function crawlController( logger.debug("Adding scrape jobs to BullMQ..."); await getScrapeQueue().addBulk(jobs); } else { - logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap }); + logger.debug("Sitemap not found or ignored.", { + ignoreSitemap: sc.crawlerOptions.ignoreSitemap + }); logger.debug("Locking URL..."); await lockURL(id, sc, req.body.url); @@ -168,30 +192,37 @@ export async function crawlController( origin: "api", crawl_id: id, webhook: req.body.webhook, - v1: true, + v1: true }, { - priority: 15, + priority: 15 }, - jobId, + jobId ); logger.debug("Adding scrape job to BullMQ...", { jobId }); await addCrawlJob(id, jobId); } logger.debug("Done queueing jobs!"); - if(req.body.webhook) { - logger.debug("Calling webhook with crawl.started...", { webhook: req.body.webhook }); - await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started"); + if (req.body.webhook) { + logger.debug("Calling webhook with crawl.started...", { + webhook: req.body.webhook + }); + await callWebhook( + req.auth.team_id, + id, + null, + req.body.webhook, + true, + "crawl.started" + ); } const protocol = process.env.ENV === "local" ? req.protocol : "https"; - + return res.status(200).json({ success: true, id, - url: `${protocol}://${req.get("host")}/v1/crawl/${id}`, + url: `${protocol}://${req.get("host")}/v1/crawl/${id}` }); } - - diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 736c8760..74b188e7 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -6,7 +6,7 @@ import { extractRequestSchema, ExtractResponse, MapDocument, - scrapeOptions, + scrapeOptions } from "./types"; import { Document } from "../../lib/entities"; import Redis from "ioredis"; @@ -46,7 +46,7 @@ export async function extractController( res: Response ) { const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true"; - + req.body = extractRequestSchema.parse(req.body); const id = crypto.randomUUID(); @@ -56,17 +56,19 @@ export async function extractController( // Process all URLs in parallel const urlPromises = req.body.urls.map(async (url) => { - if (url.includes('/*') || req.body.allowExternalLinks) { + if (url.includes("/*") || req.body.allowExternalLinks) { // Handle glob pattern URLs - const baseUrl = url.replace('/*', ''); + const baseUrl = url.replace("/*", ""); // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any const allowExternalLinks = req.body.allowExternalLinks ?? true; let urlWithoutWww = baseUrl.replace("www.", ""); - let mapUrl = req.body.prompt && allowExternalLinks - ? `${req.body.prompt} ${urlWithoutWww}` - : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}` - : `site:${urlWithoutWww}`; + let mapUrl = + req.body.prompt && allowExternalLinks + ? `${req.body.prompt} ${urlWithoutWww}` + : req.body.prompt + ? `${req.body.prompt} site:${urlWithoutWww}` + : `site:${urlWithoutWww}`; const mapResults = await getMapResults({ url: baseUrl, @@ -79,15 +81,17 @@ export async function extractController( // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping ignoreSitemap: !selfHosted ? true : false, includeMetadata: true, - includeSubdomains: req.body.includeSubdomains, + includeSubdomains: req.body.includeSubdomains }); let mappedLinks = mapResults.links as MapDocument[]; // Limit number of links to MAX_EXTRACT_LIMIT mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); - let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`); - + let mappedLinksRerank = mappedLinks.map( + (x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}` + ); + // Filter by path prefix if present // wrong // if (pathPrefix) { @@ -96,32 +100,50 @@ export async function extractController( if (req.body.prompt) { // Get similarity scores between the search query and each link's context - const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl); - + const linksAndScores = await performRanking( + mappedLinksRerank, + mappedLinks.map((l) => l.url), + mapUrl + ); + // First try with high threshold - let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD); - + let filteredLinks = filterAndProcessLinks( + mappedLinks, + linksAndScores, + INITIAL_SCORE_THRESHOLD + ); + // If we don't have enough high-quality links, try with lower threshold if (filteredLinks.length < MIN_REQUIRED_LINKS) { - logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`); - filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD); - + logger.info( + `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...` + ); + filteredLinks = filterAndProcessLinks( + mappedLinks, + linksAndScores, + FALLBACK_SCORE_THRESHOLD + ); + if (filteredLinks.length === 0) { // If still no results, take top N results regardless of score - logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`); + logger.warn( + `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.` + ); filteredLinks = linksAndScores .sort((a, b) => b.score - a.score) .slice(0, MIN_REQUIRED_LINKS) - .map(x => mappedLinks.find(link => link.url === x.link)) - .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)); + .map((x) => mappedLinks.find((link) => link.url === x.link)) + .filter( + (x): x is MapDocument => + x !== undefined && x.url !== undefined && !isUrlBlocked(x.url) + ); } } mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT); } - return mappedLinks.map(x => x.url) as string[]; - + return mappedLinks.map((x) => x.url) as string[]; } else { // Handle direct URLs without glob pattern if (!isUrlBlocked(url)) { @@ -138,7 +160,8 @@ export async function extractController( if (links.length === 0) { return res.status(400).json({ success: false, - error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs." + error: + "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs." }); } @@ -151,19 +174,19 @@ export async function extractController( const jobPriority = await getJobPriority({ plan: req.auth.plan as PlanType, team_id: req.auth.team_id, - basePriority: 10, + basePriority: 10 }); await addScrapeJob( { url, - mode: "single_urls", + mode: "single_urls", team_id: req.auth.team_id, scrapeOptions: scrapeOptions.parse({}), internalOptions: {}, plan: req.auth.plan!, origin, - is_scrape: true, + is_scrape: true }, {}, jobId, @@ -179,7 +202,10 @@ export async function extractController( return doc; } catch (e) { logger.error(`Error in scrapeController: ${e}`); - if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) { + if ( + e instanceof Error && + (e.message.startsWith("Job wait") || e.message === "timeout") + ) { throw { status: 408, error: "Request timed out" @@ -187,7 +213,7 @@ export async function extractController( } else { throw { status: 500, - error: `(Internal server error) - ${(e && e.message) ? e.message : e}` + error: `(Internal server error) - ${e && e.message ? e.message : e}` }; } } @@ -195,7 +221,7 @@ export async function extractController( try { const results = await Promise.all(scrapePromises); - docs.push(...results.filter(doc => doc !== null).map(x => x!)); + docs.push(...results.filter((doc) => doc !== null).map((x) => x!)); } catch (e) { return res.status(e.status).json({ success: false, @@ -207,20 +233,26 @@ export async function extractController( logger.child({ method: "extractController/generateOpenAICompletions" }), { mode: "llm", - systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), + systemPrompt: + "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + + links.join(", "), prompt: req.body.prompt, - schema: req.body.schema, + schema: req.body.schema }, - docs.map(x => buildDocument(x)).join('\n'), + docs.map((x) => buildDocument(x)).join("\n"), undefined, true // isExtractEndpoint ); // TODO: change this later // While on beta, we're billing 5 credits per link discovered/scraped. - billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => { - logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`); - }); + billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch( + (error) => { + logger.error( + `Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}` + ); + } + ); let data = completions.extract ?? {}; let warning = completions.warning; @@ -256,12 +288,20 @@ export async function extractController( * @returns The filtered list of links. */ function filterAndProcessLinks( - mappedLinks: MapDocument[], - linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[], + mappedLinks: MapDocument[], + linksAndScores: { + link: string; + linkWithContext: string; + score: number; + originalIndex: number; + }[], threshold: number ): MapDocument[] { return linksAndScores - .filter(x => x.score > threshold) - .map(x => mappedLinks.find(link => link.url === x.link)) - .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)); -} \ No newline at end of file + .filter((x) => x.score > threshold) + .map((x) => mappedLinks.find((link) => link.url === x.link)) + .filter( + (x): x is MapDocument => + x !== undefined && x.url !== undefined && !isUrlBlocked(x.url) + ); +} diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 9a0a5eb6..7ddd7b78 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,6 +1,11 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; -import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types"; +import { + MapDocument, + mapRequestSchema, + RequestWithAuth, + scrapeOptions +} from "./types"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { MapResponse, MapRequest } from "./types"; import { configDotenv } from "dotenv"; @@ -8,7 +13,7 @@ import { checkAndUpdateURLForMap, isSameDomain, isSameSubdomain, - removeDuplicateUrls, + removeDuplicateUrls } from "../../lib/validateUrl"; import { fireEngineMap } from "../../search/fireEngine"; import { billTeam } from "../../services/billing/credit_billing"; @@ -67,13 +72,13 @@ export async function getMapResults({ crawlerOptions: { ...crawlerOptions, limit: crawlerOptions.sitemapOnly ? 10000000 : limit, - scrapeOptions: undefined, + scrapeOptions: undefined }, scrapeOptions: scrapeOptions.parse({}), internalOptions: {}, team_id: teamId, createdAt: Date.now(), - plan: plan, + plan: plan }; const crawler = crawlToCrawler(id, sc); @@ -85,7 +90,8 @@ export async function getMapResults({ sitemap.forEach((x) => { links.push(x.url); }); - links = links.slice(1) + links = links + .slice(1) .map((x) => { try { return checkAndUpdateURLForMap(x).url.trim(); @@ -99,13 +105,17 @@ export async function getMapResults({ } else { let urlWithoutWww = url.replace("www.", ""); - let mapUrl = search && allowExternalLinks - ? `${search} ${urlWithoutWww}` - : search ? `${search} site:${urlWithoutWww}` - : `site:${url}`; + let mapUrl = + search && allowExternalLinks + ? `${search} ${urlWithoutWww}` + : search + ? `${search} site:${urlWithoutWww}` + : `site:${url}`; const resultsPerPage = 100; - const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); + const maxPages = Math.ceil( + Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage + ); const cacheKey = `fireEngineMap:${mapUrl}`; const cachedResult = await redis.get(cacheKey); @@ -119,7 +129,7 @@ export async function getMapResults({ const fetchPage = async (page: number) => { return fireEngineMap(mapUrl, { numResults: resultsPerPage, - page: page, + page: page }); }; @@ -134,7 +144,7 @@ export async function getMapResults({ // Parallelize sitemap fetch with serper search const [sitemap, ...searchResults] = await Promise.all([ ignoreSitemap ? null : crawler.tryGetSitemap(true), - ...(cachedResult ? [] : pagePromises), + ...(cachedResult ? [] : pagePromises) ]); if (!cachedResult) { @@ -162,7 +172,7 @@ export async function getMapResults({ links = [ mapResults[0].url, ...mapResults.slice(1).map((x) => x.url), - ...links, + ...links ]; } else { mapResults.map((x) => { @@ -199,14 +209,16 @@ export async function getMapResults({ links = removeDuplicateUrls(links); } - const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit); + const linksToReturn = crawlerOptions.sitemapOnly + ? links + : links.slice(0, limit); return { success: true, links: includeMetadata ? mapResults : linksToReturn, scrape_id: origin?.includes("website") ? id : undefined, job_id: id, - time_taken: (new Date().getTime() - Date.now()) / 1000, + time_taken: (new Date().getTime() - Date.now()) / 1000 }; } @@ -225,7 +237,7 @@ export async function mapController( crawlerOptions: req.body, origin: req.body.origin, teamId: req.auth.team_id, - plan: req.auth.plan, + plan: req.auth.plan }); // Bill the team @@ -244,12 +256,12 @@ export async function mapController( docs: result.links, time_taken: result.time_taken, team_id: req.auth.team_id, - mode: "map", + mode: "map", url: req.body.url, crawlerOptions: {}, scrapeOptions: {}, origin: req.body.origin ?? "api", - num_tokens: 0, + num_tokens: 0 }); const response = { @@ -259,4 +271,4 @@ export async function mapController( }; return res.status(200).json(response); -} \ No newline at end of file +} diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index b7f19a3b..b366b79e 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -12,30 +12,30 @@ export async function scrapeStatusController(req: any, res: any) { const job = await supabaseGetJobByIdOnlyData(req.params.jobId); const allowedTeams = [ - "41bdbfe1-0579-4d9b-b6d5-809f16be12f5", + "41bdbfe1-0579-4d9b-b6d5-809f16be12f5", "511544f2-2fce-4183-9c59-6c29b02c69b5" ]; - if(!allowedTeams.includes(job?.team_id)){ + if (!allowedTeams.includes(job?.team_id)) { return res.status(403).json({ success: false, - error: "You are not allowed to access this resource.", + error: "You are not allowed to access this resource." }); } return res.status(200).json({ success: true, - data: job?.docs[0], + data: job?.docs[0] }); } catch (error) { if (error instanceof Error && error.message == "Too Many Requests") { return res.status(429).json({ success: false, - error: "Rate limit exceeded. Please try again later.", + error: "Rate limit exceeded. Please try again later." }); } else { return res.status(500).json({ success: false, - error: "An unexpected error occurred.", + error: "An unexpected error occurred." }); } } diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 9c85c91e..05cc68e3 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -5,7 +5,7 @@ import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, - ScrapeResponse, + ScrapeResponse } from "./types"; import { billTeam } from "../../services/billing/credit_billing"; import { v4 as uuidv4 } from "uuid"; @@ -30,7 +30,7 @@ export async function scrapeController( const jobPriority = await getJobPriority({ plan: req.auth.plan as PlanType, team_id: req.auth.team_id, - basePriority: 10, + basePriority: 10 }); await addScrapeJob( @@ -42,29 +42,37 @@ export async function scrapeController( internalOptions: {}, plan: req.auth.plan!, origin: req.body.origin, - is_scrape: true, + is_scrape: true }, {}, jobId, jobPriority ); - const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0); + const totalWait = + (req.body.waitFor ?? 0) + + (req.body.actions ?? []).reduce( + (a, x) => (x.type === "wait" ? (x.milliseconds ?? 0) : 0) + a, + 0 + ); let doc: Document; try { doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this } catch (e) { logger.error(`Error in scrapeController: ${e}`); - if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) { + if ( + e instanceof Error && + (e.message.startsWith("Job wait") || e.message === "timeout") + ) { return res.status(408).json({ success: false, - error: "Request timed out", + error: "Request timed out" }); } else { return res.status(500).json({ success: false, - error: `(Internal server error) - ${(e && e.message) ? e.message : e}`, + error: `(Internal server error) - ${e && e.message ? e.message : e}` }); } } @@ -75,8 +83,8 @@ export async function scrapeController( const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = doc && doc.extract - // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") - ? 0 // TODO: fix + ? // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") + 0 // TODO: fix : 0; let creditsToBeBilled = 1; // Assuming 1 credit per document @@ -84,14 +92,18 @@ export async function scrapeController( // Don't bill if we're early returning return; } - if(req.body.extract && req.body.formats.includes("extract")) { + if (req.body.extract && req.body.formats.includes("extract")) { creditsToBeBilled = 5; } - billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => { - logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); - // Optionally, you could notify an admin or add to a retry queue here - }); + billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch( + (error) => { + logger.error( + `Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}` + ); + // Optionally, you could notify an admin or add to a retry queue here + } + ); if (!req.body.formats.includes("rawHtml")) { if (doc && doc.rawHtml) { @@ -111,12 +123,12 @@ export async function scrapeController( url: req.body.url, scrapeOptions: req.body, origin: origin, - num_tokens: numTokens, + num_tokens: numTokens }); return res.status(200).json({ success: true, data: doc, - scrape_id: origin?.includes("website") ? jobId : undefined, + scrape_id: origin?.includes("website") ? jobId : undefined }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index f1596f5e..f9fa2392 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -4,7 +4,12 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { PlanType } from "../../types"; import { countries } from "../../lib/validate-country"; -import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities"; +import { + ExtractorOptions, + PageOptions, + ScrapeActionContent, + Document as V0Document +} from "../../lib/entities"; import { InternalOptions } from "../../scraper/scrapeURL"; export type Format = @@ -31,212 +36,265 @@ export const url = z.preprocess( (x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x), "URL must have a valid top-level domain or be a valid path" ) - .refine( - (x) => { - try { - checkUrl(x as string) - return true; - } catch (_) { - return false; - } - }, - "Invalid URL" - ) + .refine((x) => { + try { + checkUrl(x as string); + return true; + } catch (_) { + return false; + } + }, "Invalid URL") .refine( (x) => !isUrlBlocked(x as string), "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." ) ); -const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes"; +const strictMessage = + "Unrecognized key in body -- please review the v1 API documentation for request body changes"; -export const extractOptions = z.object({ - mode: z.enum(["llm"]).default("llm"), - schema: z.any().optional(), - systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."), - prompt: z.string().optional() -}).strict(strictMessage); +export const extractOptions = z + .object({ + mode: z.enum(["llm"]).default("llm"), + schema: z.any().optional(), + systemPrompt: z + .string() + .default( + "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required." + ), + prompt: z.string().optional() + }) + .strict(strictMessage); export type ExtractOptions = z.infer; -export const actionsSchema = z.array(z.union([ - z.object({ - type: z.literal("wait"), - milliseconds: z.number().int().positive().finite().optional(), - selector: z.string().optional(), - }).refine( - (data) => (data.milliseconds !== undefined || data.selector !== undefined) && !(data.milliseconds !== undefined && data.selector !== undefined), - { - message: "Either 'milliseconds' or 'selector' must be provided, but not both.", - } - ), - z.object({ - type: z.literal("click"), - selector: z.string(), - }), - z.object({ - type: z.literal("screenshot"), - fullPage: z.boolean().default(false), - }), - z.object({ - type: z.literal("write"), - text: z.string(), - }), - z.object({ - type: z.literal("press"), - key: z.string(), - }), - z.object({ - type: z.literal("scroll"), - direction: z.enum(["up", "down"]).optional().default("down"), - selector: z.string().optional(), - }), - z.object({ - type: z.literal("scrape"), - }), - z.object({ - type: z.literal("executeJavascript"), - script: z.string() - }), -])); - -export const scrapeOptions = z.object({ - formats: z - .enum([ - "markdown", - "html", - "rawHtml", - "links", - "screenshot", - "screenshot@fullPage", - "extract" - ]) - .array() - .optional() - .default(["markdown"]) - .refine(x => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage"), - headers: z.record(z.string(), z.string()).optional(), - includeTags: z.string().array().optional(), - excludeTags: z.string().array().optional(), - onlyMainContent: z.boolean().default(true), - timeout: z.number().int().positive().finite().safe().optional(), - waitFor: z.number().int().nonnegative().finite().safe().default(0), - extract: extractOptions.optional(), - mobile: z.boolean().default(false), - parsePDF: z.boolean().default(true), - actions: actionsSchema.optional(), - // New - location: z.object({ - country: z.string().optional().refine( - (val) => !val || Object.keys(countries).includes(val.toUpperCase()), - { - message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", - } - ).transform(val => val ? val.toUpperCase() : 'US'), - languages: z.string().array().optional(), - }).optional(), - - // Deprecated - geolocation: z.object({ - country: z.string().optional().refine( - (val) => !val || Object.keys(countries).includes(val.toUpperCase()), - { - message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", - } - ).transform(val => val ? val.toUpperCase() : 'US'), - languages: z.string().array().optional(), - }).optional(), - skipTlsVerification: z.boolean().default(false), - removeBase64Images: z.boolean().default(true), -}).strict(strictMessage) +export const actionsSchema = z.array( + z.union([ + z + .object({ + type: z.literal("wait"), + milliseconds: z.number().int().positive().finite().optional(), + selector: z.string().optional() + }) + .refine( + (data) => + (data.milliseconds !== undefined || data.selector !== undefined) && + !(data.milliseconds !== undefined && data.selector !== undefined), + { + message: + "Either 'milliseconds' or 'selector' must be provided, but not both." + } + ), + z.object({ + type: z.literal("click"), + selector: z.string() + }), + z.object({ + type: z.literal("screenshot"), + fullPage: z.boolean().default(false) + }), + z.object({ + type: z.literal("write"), + text: z.string() + }), + z.object({ + type: z.literal("press"), + key: z.string() + }), + z.object({ + type: z.literal("scroll"), + direction: z.enum(["up", "down"]).optional().default("down"), + selector: z.string().optional() + }), + z.object({ + type: z.literal("scrape") + }), + z.object({ + type: z.literal("executeJavascript"), + script: z.string() + }) + ]) +); +export const scrapeOptions = z + .object({ + formats: z + .enum([ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "extract" + ]) + .array() + .optional() + .default(["markdown"]) + .refine( + (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), + "You may only specify either screenshot or screenshot@fullPage" + ), + headers: z.record(z.string(), z.string()).optional(), + includeTags: z.string().array().optional(), + excludeTags: z.string().array().optional(), + onlyMainContent: z.boolean().default(true), + timeout: z.number().int().positive().finite().safe().optional(), + waitFor: z.number().int().nonnegative().finite().safe().default(0), + extract: extractOptions.optional(), + mobile: z.boolean().default(false), + parsePDF: z.boolean().default(true), + actions: actionsSchema.optional(), + // New + location: z + .object({ + country: z + .string() + .optional() + .refine( + (val) => !val || Object.keys(countries).includes(val.toUpperCase()), + { + message: + "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code." + } + ) + .transform((val) => (val ? val.toUpperCase() : "US")), + languages: z.string().array().optional() + }) + .optional(), + // Deprecated + geolocation: z + .object({ + country: z + .string() + .optional() + .refine( + (val) => !val || Object.keys(countries).includes(val.toUpperCase()), + { + message: + "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code." + } + ) + .transform((val) => (val ? val.toUpperCase() : "US")), + languages: z.string().array().optional() + }) + .optional(), + skipTlsVerification: z.boolean().default(false), + removeBase64Images: z.boolean().default(true) + }) + .strict(strictMessage); export type ScrapeOptions = z.infer; -export const extractV1Options = z.object({ - urls: url.array().max(10, "Maximum of 10 URLs allowed per request while in beta."), - prompt: z.string().optional(), - schema: z.any().optional(), - limit: z.number().int().positive().finite().safe().optional(), - ignoreSitemap: z.boolean().default(false), - includeSubdomains: z.boolean().default(true), - allowExternalLinks: z.boolean().default(false), - origin: z.string().optional().default("api"), - timeout: z.number().int().positive().finite().safe().default(60000) -}).strict(strictMessage) +export const extractV1Options = z + .object({ + urls: url + .array() + .max(10, "Maximum of 10 URLs allowed per request while in beta."), + prompt: z.string().optional(), + schema: z.any().optional(), + limit: z.number().int().positive().finite().safe().optional(), + ignoreSitemap: z.boolean().default(false), + includeSubdomains: z.boolean().default(true), + allowExternalLinks: z.boolean().default(false), + origin: z.string().optional().default("api"), + timeout: z.number().int().positive().finite().safe().default(60000) + }) + .strict(strictMessage); export type ExtractV1Options = z.infer; export const extractRequestSchema = extractV1Options; export type ExtractRequest = z.infer; -export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({ - url, - origin: z.string().optional().default("api"), - timeout: z.number().int().positive().finite().safe().default(30000), -}).strict(strictMessage).refine( - (obj) => { - const hasExtractFormat = obj.formats?.includes("extract"); - const hasExtractOptions = obj.extract !== undefined; - return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions); - }, - { - message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa", - } -).transform((obj) => { - if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { - return { ...obj, timeout: 60000 }; - } - return obj; -}); - - +export const scrapeRequestSchema = scrapeOptions + .omit({ timeout: true }) + .extend({ + url, + origin: z.string().optional().default("api"), + timeout: z.number().int().positive().finite().safe().default(30000) + }) + .strict(strictMessage) + .refine( + (obj) => { + const hasExtractFormat = obj.formats?.includes("extract"); + const hasExtractOptions = obj.extract !== undefined; + return ( + (hasExtractFormat && hasExtractOptions) || + (!hasExtractFormat && !hasExtractOptions) + ); + }, + { + message: + "When 'extract' format is specified, 'extract' options must be provided, and vice versa" + } + ) + .transform((obj) => { + if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { + return { ...obj, timeout: 60000 }; + } + return obj; + }); export type ScrapeRequest = z.infer; export type ScrapeRequestInput = z.input; -export const webhookSchema = z.preprocess(x => { - if (typeof x === "string") { - return { url: x }; - } else { - return x; - } -}, z.object({ - url: z.string().url(), - headers: z.record(z.string(), z.string()).default({}), -}).strict(strictMessage)) - -export const batchScrapeRequestSchema = scrapeOptions.extend({ - urls: url.array(), - origin: z.string().optional().default("api"), - webhook: webhookSchema.optional(), - appendToId: z.string().uuid().optional(), -}).strict(strictMessage).refine( - (obj) => { - const hasExtractFormat = obj.formats?.includes("extract"); - const hasExtractOptions = obj.extract !== undefined; - return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions); +export const webhookSchema = z.preprocess( + (x) => { + if (typeof x === "string") { + return { url: x }; + } else { + return x; + } }, - { - message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa", - } + z + .object({ + url: z.string().url(), + headers: z.record(z.string(), z.string()).default({}) + }) + .strict(strictMessage) ); +export const batchScrapeRequestSchema = scrapeOptions + .extend({ + urls: url.array(), + origin: z.string().optional().default("api"), + webhook: webhookSchema.optional(), + appendToId: z.string().uuid().optional() + }) + .strict(strictMessage) + .refine( + (obj) => { + const hasExtractFormat = obj.formats?.includes("extract"); + const hasExtractOptions = obj.extract !== undefined; + return ( + (hasExtractFormat && hasExtractOptions) || + (!hasExtractFormat && !hasExtractOptions) + ); + }, + { + message: + "When 'extract' format is specified, 'extract' options must be provided, and vice versa" + } + ); + export type BatchScrapeRequest = z.infer; -const crawlerOptions = z.object({ - includePaths: z.string().array().default([]), - excludePaths: z.string().array().default([]), - maxDepth: z.number().default(10), // default? - limit: z.number().default(10000), // default? - allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? - allowExternalLinks: z.boolean().default(false), - allowSubdomains: z.boolean().default(false), - ignoreRobotsTxt: z.boolean().default(false), - ignoreSitemap: z.boolean().default(false), - deduplicateSimilarURLs: z.boolean().default(true), - ignoreQueryParameters: z.boolean().default(false), -}).strict(strictMessage); +const crawlerOptions = z + .object({ + includePaths: z.string().array().default([]), + excludePaths: z.string().array().default([]), + maxDepth: z.number().default(10), // default? + limit: z.number().default(10000), // default? + allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? + allowExternalLinks: z.boolean().default(false), + allowSubdomains: z.boolean().default(false), + ignoreRobotsTxt: z.boolean().default(false), + ignoreSitemap: z.boolean().default(false), + deduplicateSimilarURLs: z.boolean().default(true), + ignoreQueryParameters: z.boolean().default(false) + }) + .strict(strictMessage); // export type CrawlerOptions = { // includePaths?: string[]; @@ -250,13 +308,15 @@ const crawlerOptions = z.object({ export type CrawlerOptions = z.infer; -export const crawlRequestSchema = crawlerOptions.extend({ - url, - origin: z.string().optional().default("api"), - scrapeOptions: scrapeOptions.default({}), - webhook: webhookSchema.optional(), - limit: z.number().default(10000), -}).strict(strictMessage); +export const crawlRequestSchema = crawlerOptions + .extend({ + url, + origin: z.string().optional().default("api"), + scrapeOptions: scrapeOptions.default({}), + webhook: webhookSchema.optional(), + limit: z.number().default(10000) + }) + .strict(strictMessage); // export type CrawlRequest = { // url: string; @@ -270,18 +330,19 @@ export const crawlRequestSchema = crawlerOptions.extend({ // extractionSchema?: Record; // } - export type CrawlRequest = z.infer; -export const mapRequestSchema = crawlerOptions.extend({ - url, - origin: z.string().optional().default("api"), - includeSubdomains: z.boolean().default(true), - search: z.string().optional(), - ignoreSitemap: z.boolean().default(false), - sitemapOnly: z.boolean().default(false), - limit: z.number().min(1).max(5000).default(5000), -}).strict(strictMessage); +export const mapRequestSchema = crawlerOptions + .extend({ + url, + origin: z.string().optional().default("api"), + includeSubdomains: z.boolean().default(true), + search: z.string().optional(), + ignoreSitemap: z.boolean().default(false), + sitemapOnly: z.boolean().default(false), + limit: z.number().min(1).max(5000).default(5000) + }) + .strict(strictMessage); // export type MapRequest = { // url: string; @@ -451,7 +512,7 @@ export interface RequestWithMaybeACUC< ReqBody = undefined, ResBody = undefined > extends Request { - acuc?: AuthCreditUsageChunk, + acuc?: AuthCreditUsageChunk; } export interface RequestWithACUC< @@ -459,13 +520,13 @@ export interface RequestWithACUC< ReqBody = undefined, ResBody = undefined > extends Request { - acuc: AuthCreditUsageChunk, + acuc: AuthCreditUsageChunk; } export interface RequestWithAuth< ReqParams = {}, ReqBody = undefined, - ResBody = undefined, + ResBody = undefined > extends Request { auth: AuthObject; account?: Account; @@ -483,16 +544,15 @@ export interface RequestWithMaybeAuth< export interface RequestWithAuth< ReqParams = {}, ReqBody = undefined, - ResBody = undefined, + ResBody = undefined > extends RequestWithACUC { auth: AuthObject; account?: Account; } -export interface ResponseWithSentry< - ResBody = undefined, -> extends Response { - sentry?: string, +export interface ResponseWithSentry + extends Response { + sentry?: string; } export function toLegacyCrawlerOptions(x: CrawlerOptions) { @@ -509,11 +569,14 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) { ignoreRobotsTxt: x.ignoreRobotsTxt, ignoreSitemap: x.ignoreSitemap, deduplicateSimilarURLs: x.deduplicateSimilarURLs, - ignoreQueryParameters: x.ignoreQueryParameters, + ignoreQueryParameters: x.ignoreQueryParameters }; } -export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } { +export function fromLegacyCrawlerOptions(x: any): { + crawlOptions: CrawlerOptions; + internalOptions: InternalOptions; +} { return { crawlOptions: crawlerOptions.parse({ includePaths: x.includes, @@ -526,37 +589,50 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions ignoreRobotsTxt: x.ignoreRobotsTxt, ignoreSitemap: x.ignoreSitemap, deduplicateSimilarURLs: x.deduplicateSimilarURLs, - ignoreQueryParameters: x.ignoreQueryParameters, + ignoreQueryParameters: x.ignoreQueryParameters }), internalOptions: { - v0CrawlOnlyUrls: x.returnOnlyUrls, - }, + v0CrawlOnlyUrls: x.returnOnlyUrls + } }; } - - export interface MapDocument { url: string; title?: string; description?: string; -} -export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } { +} +export function fromLegacyScrapeOptions( + pageOptions: PageOptions, + extractorOptions: ExtractorOptions | undefined, + timeout: number | undefined +): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { return { scrapeOptions: scrapeOptions.parse({ formats: [ - (pageOptions.includeMarkdown ?? true) ? "markdown" as const : null, - (pageOptions.includeHtml ?? false) ? "html" as const : null, - (pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null, - (pageOptions.screenshot ?? false) ? "screenshot" as const : null, - (pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null, - (extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null, + (pageOptions.includeMarkdown ?? true) ? ("markdown" as const) : null, + (pageOptions.includeHtml ?? false) ? ("html" as const) : null, + (pageOptions.includeRawHtml ?? false) ? ("rawHtml" as const) : null, + (pageOptions.screenshot ?? false) ? ("screenshot" as const) : null, + (pageOptions.fullPageScreenshot ?? false) + ? ("screenshot@fullPage" as const) + : null, + extractorOptions !== undefined && + extractorOptions.mode.includes("llm-extraction") + ? ("extract" as const) + : null, "links" - ].filter(x => x !== null), + ].filter((x) => x !== null), waitFor: pageOptions.waitFor, headers: pageOptions.headers, - includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags), - excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags), + includeTags: + typeof pageOptions.onlyIncludeTags === "string" + ? [pageOptions.onlyIncludeTags] + : pageOptions.onlyIncludeTags, + excludeTags: + typeof pageOptions.removeTags === "string" + ? [pageOptions.removeTags] + : pageOptions.removeTags, onlyMainContent: pageOptions.onlyMainContent ?? false, timeout: timeout, parsePDF: pageOptions.parsePDF, @@ -564,29 +640,45 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio location: pageOptions.geolocation, skipTlsVerification: pageOptions.skipTlsVerification, removeBase64Images: pageOptions.removeBase64Images, - extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? { - systemPrompt: extractorOptions.extractionPrompt, - prompt: extractorOptions.userPrompt, - schema: extractorOptions.extractionSchema, - } : undefined, - mobile: pageOptions.mobile, + extract: + extractorOptions !== undefined && + extractorOptions.mode.includes("llm-extraction") + ? { + systemPrompt: extractorOptions.extractionPrompt, + prompt: extractorOptions.userPrompt, + schema: extractorOptions.extractionSchema + } + : undefined, + mobile: pageOptions.mobile }), internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, - v0UseFastMode: pageOptions.useFastMode, - }, + v0UseFastMode: pageOptions.useFastMode + } // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks - } + }; } -export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} { - const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout); +export function fromLegacyCombo( + pageOptions: PageOptions, + extractorOptions: ExtractorOptions | undefined, + timeout: number | undefined, + crawlerOptions: any +): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { + const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions( + pageOptions, + extractorOptions, + timeout + ); const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; } -export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } { +export function toLegacyDocument( + document: Document, + internalOptions: InternalOptions +): V0Document | { url: string } { if (internalOptions.v0CrawlOnlyUrls) { return { url: document.metadata.sourceURL! }; } @@ -604,9 +696,9 @@ export function toLegacyDocument(document: Document, internalOptions: InternalOp statusCode: undefined, pageError: document.metadata.error, pageStatusCode: document.metadata.statusCode, - screenshot: document.screenshot, + screenshot: document.screenshot }, - actions: document.actions , - warning: document.warning, - } + actions: document.actions, + warning: document.warning + }; } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index e32bf97f..905c32d8 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry" +import "./services/sentry"; import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; @@ -9,9 +9,9 @@ import { v0Router } from "./routes/v0"; import os from "os"; import { logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; -import http from 'node:http'; -import https from 'node:https'; -import CacheableLookup from 'cacheable-lookup'; +import http from "node:http"; +import https from "node:https"; +import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; @@ -25,14 +25,12 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; logger.info(`Number of CPUs: ${numCPUs} available`); -const cacheable = new CacheableLookup() - +const cacheable = new CacheableLookup(); // Install cacheable lookup for all other requests cacheable.install(http.globalAgent); cacheable.install(https.globalAgent); - const ws = expressWs(express()); const app = ws.app; @@ -48,7 +46,7 @@ serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ queues: [new BullAdapter(getScrapeQueue())], - serverAdapter: serverAdapter, + serverAdapter: serverAdapter }); app.use( @@ -82,15 +80,15 @@ function startServer(port = DEFAULT_PORT) { }); const exitHandler = () => { - logger.info('SIGTERM signal received: closing HTTP server') + logger.info("SIGTERM signal received: closing HTTP server"); server.close(() => { logger.info("Server closed."); process.exit(0); }); }; - process.on('SIGTERM', exitHandler); - process.on('SIGINT', exitHandler); + process.on("SIGTERM", exitHandler); + process.on("SIGINT", exitHandler); return server; } @@ -101,13 +99,11 @@ if (require.main === module) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([ - scrapeQueue.getWaitingCount(), - ]); + const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs return res.status(noWaitingJobs ? 200 : 500).json({ - waitingJobs, + waitingJobs }); } catch (error) { Sentry.captureException(error); @@ -124,7 +120,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => { const getWaitingJobsCount = async () => { const scrapeQueue = getScrapeQueue(); const [waitingJobsCount] = await Promise.all([ - scrapeQueue.getWaitingCount(), + scrapeQueue.getWaitingCount() ]); return waitingJobsCount; @@ -144,15 +140,15 @@ app.get("/serverHealthCheck/notify", async (req, res) => { const message = { text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ timeout / 60000 - } minute(s).`, + } minute(s).` }; const response = await fetch(slackWebhookUrl, { method: "POST", headers: { - "Content-Type": "application/json", + "Content-Type": "application/json" }, - body: JSON.stringify(message), + body: JSON.stringify(message) }); if (!response.ok) { @@ -175,40 +171,80 @@ app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); -app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { - if (err instanceof ZodError) { - if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) { +app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: Response, + next: NextFunction + ) => { + if (err instanceof ZodError) { + if ( + Array.isArray(err.errors) && + err.errors.find((x) => x.message === "URL uses unsupported protocol") + ) { logger.warn("Unsupported protocol error: " + JSON.stringify(req.body)); } - res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); - } else { + res + .status(400) + .json({ success: false, error: "Bad Request", details: err.errors }); + } else { next(err); + } } -}); +); Sentry.setupExpressErrorHandler(app); -app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { - if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { - return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); - } - - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); +app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: ResponseWithSentry, + next: NextFunction + ) => { + if ( + err instanceof SyntaxError && + "status" in err && + err.status === 400 && + "body" in err + ) { + return res + .status(400) + .json({ success: false, error: "Bad request, malformed JSON" }); } - } - logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id }); -}); + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack + }); + } + } + + logger.error( + "Error occurred in request! (" + + req.path + + ") -- ID " + + id + + " -- " + + verbose + ); + res + .status(500) + .json({ + success: false, + error: + "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + + id + }); + } +); logger.info(`Worker ${process.pid} started`); @@ -220,6 +256,3 @@ logger.info(`Worker ${process.pid} started`); // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); - - - diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 430dc1d4..47ecaf18 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -32,7 +32,7 @@ export async function generateCompletions( schema: schema, prompt: prompt, systemPrompt: systemPrompt, - mode: mode, + mode: mode }); // Validate the JSON output against the schema using AJV if (schema) { diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index f777dce9..563863c0 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -50,7 +50,7 @@ export async function generateOpenAICompletions({ systemPrompt = defaultPrompt, prompt, temperature, - mode, + mode }: { client: OpenAI; model?: string; @@ -68,7 +68,7 @@ export async function generateOpenAICompletions({ return { ...document, warning: - "LLM extraction was not performed since the document's content is empty or missing.", + "LLM extraction was not performed since the document's content is empty or missing." }; } const [content, numTokens] = preparedDoc; @@ -81,16 +81,16 @@ export async function generateOpenAICompletions({ messages: [ { role: "system", - content: systemPrompt, + content: systemPrompt }, { role: "user", content }, { role: "user", - content: `Transform the above content into structured json output based on the following user request: ${prompt}`, - }, + content: `Transform the above content into structured json output based on the following user request: ${prompt}` + } ], response_format: { type: "json_object" }, - temperature, + temperature }); try { @@ -106,9 +106,9 @@ export async function generateOpenAICompletions({ messages: [ { role: "system", - content: systemPrompt, + content: systemPrompt }, - { role: "user", content }, + { role: "user", content } ], tools: [ { @@ -116,12 +116,12 @@ export async function generateOpenAICompletions({ function: { name: "extract_content", description: "Extracts the content from the given webpage(s)", - parameters: schema, - }, - }, + parameters: schema + } + } ], tool_choice: { type: "function", function: { name: "extract_content" } }, - temperature, + temperature }); const c = completion.choices[0].message.tool_calls[0].function.arguments; @@ -140,6 +140,6 @@ export async function generateOpenAICompletions({ warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` - : undefined, + : undefined }; } diff --git a/apps/api/src/lib/__tests__/html-to-markdown.test.ts b/apps/api/src/lib/__tests__/html-to-markdown.test.ts index 3c68c959..f69c2949 100644 --- a/apps/api/src/lib/__tests__/html-to-markdown.test.ts +++ b/apps/api/src/lib/__tests__/html-to-markdown.test.ts @@ -1,36 +1,46 @@ -import { parseMarkdown } from '../html-to-markdown'; +import { parseMarkdown } from "../html-to-markdown"; -describe('parseMarkdown', () => { - it('should correctly convert simple HTML to Markdown', async () => { - const html = '

Hello, world!

'; - const expectedMarkdown = 'Hello, world!'; +describe("parseMarkdown", () => { + it("should correctly convert simple HTML to Markdown", async () => { + const html = "

Hello, world!

"; + const expectedMarkdown = "Hello, world!"; await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); }); - it('should convert complex HTML with nested elements to Markdown', async () => { - const html = '

Hello bold world!

  • List item
'; - const expectedMarkdown = 'Hello **bold** world!\n\n- List item'; + it("should convert complex HTML with nested elements to Markdown", async () => { + const html = + "

Hello bold world!

  • List item
"; + const expectedMarkdown = "Hello **bold** world!\n\n- List item"; await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); }); - it('should return empty string when input is empty', async () => { - const html = ''; - const expectedMarkdown = ''; + it("should return empty string when input is empty", async () => { + const html = ""; + const expectedMarkdown = ""; await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); }); - it('should handle null input gracefully', async () => { + it("should handle null input gracefully", async () => { const html = null; - const expectedMarkdown = ''; + const expectedMarkdown = ""; await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); }); - it('should handle various types of invalid HTML gracefully', async () => { + it("should handle various types of invalid HTML gracefully", async () => { const invalidHtmls = [ - { html: '

Unclosed tag', expected: 'Unclosed tag' }, - { html: '

Missing closing div', expected: 'Missing closing div' }, - { html: '

Wrong nesting

', expected: '**Wrong nesting**' }, - { html: 'Link without closing tag', expected: '[Link without closing tag](http://example.com)' } + { html: "

Unclosed tag", expected: "Unclosed tag" }, + { + html: "

Missing closing div", + expected: "Missing closing div" + }, + { + html: "

Wrong nesting

", + expected: "**Wrong nesting**" + }, + { + html: '
Link without closing tag', + expected: "[Link without closing tag](http://example.com)" + } ]; for (const { html, expected } of invalidHtmls) { diff --git a/apps/api/src/lib/__tests__/job-priority.test.ts b/apps/api/src/lib/__tests__/job-priority.test.ts index 82477379..4bd5fda9 100644 --- a/apps/api/src/lib/__tests__/job-priority.test.ts +++ b/apps/api/src/lib/__tests__/job-priority.test.ts @@ -1,7 +1,7 @@ import { getJobPriority, addJobPriority, - deleteJobPriority, + deleteJobPriority } from "../job-priority"; import { redisConnection } from "../../services/queue-service"; import { PlanType } from "../../types"; @@ -11,8 +11,8 @@ jest.mock("../../services/queue-service", () => ({ sadd: jest.fn(), srem: jest.fn(), scard: jest.fn(), - expire: jest.fn(), - }, + expire: jest.fn() + } })); describe("Job Priority Tests", () => { diff --git a/apps/api/src/lib/batch-process.ts b/apps/api/src/lib/batch-process.ts index 802d1eb1..20bb4ab6 100644 --- a/apps/api/src/lib/batch-process.ts +++ b/apps/api/src/lib/batch-process.ts @@ -1,16 +1,15 @@ export async function batchProcess( - array: T[], - batchSize: number, - asyncFunction: (item: T, index: number) => Promise - ): Promise { - const batches: T[][] = []; - for (let i = 0; i < array.length; i += batchSize) { - const batch = array.slice(i, i + batchSize); - batches.push(batch); - } - - for (const batch of batches) { - await Promise.all(batch.map((item, i) => asyncFunction(item, i))); - } + array: T[], + batchSize: number, + asyncFunction: (item: T, index: number) => Promise +): Promise { + const batches: T[][] = []; + for (let i = 0; i < array.length; i += batchSize) { + const batch = array.slice(i, i + batchSize); + batches.push(batch); } - \ No newline at end of file + + for (const batch of batches) { + await Promise.all(batch.map((item, i) => asyncFunction(item, i))); + } +} diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts index 896d9429..30c9f0b4 100644 --- a/apps/api/src/lib/cache.ts +++ b/apps/api/src/lib/cache.ts @@ -2,49 +2,61 @@ import IORedis from "ioredis"; import { ScrapeOptions } from "../controllers/v1/types"; import { InternalOptions } from "../scraper/scrapeURL"; import { logger as _logger } from "./logger"; -const logger = _logger.child({module: "cache"}); +const logger = _logger.child({ module: "cache" }); -export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, { - maxRetriesPerRequest: null, -}) : null; +export const cacheRedis = process.env.CACHE_REDIS_URL + ? new IORedis(process.env.CACHE_REDIS_URL, { + maxRetriesPerRequest: null + }) + : null; -export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null { - if (!cacheRedis) return null; +export function cacheKey( + url: string, + scrapeOptions: ScrapeOptions, + internalOptions: InternalOptions +): string | null { + if (!cacheRedis) return null; - // these options disqualify a cache - if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv - || (scrapeOptions.actions && scrapeOptions.actions.length > 0) - ) { - return null; - } + // these options disqualify a cache + if ( + internalOptions.v0CrawlOnlyUrls || + internalOptions.forceEngine || + internalOptions.v0UseFastMode || + internalOptions.atsv || + (scrapeOptions.actions && scrapeOptions.actions.length > 0) + ) { + return null; + } - return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor; + return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor; } export type CacheEntry = { - url: string; - html: string; - statusCode: number; - error?: string; + url: string; + html: string; + statusCode: number; + error?: string; }; export async function saveEntryToCache(key: string, entry: CacheEntry) { - if (!cacheRedis) return; + if (!cacheRedis) return; - try { - await cacheRedis.set(key, JSON.stringify(entry)); - } catch (error) { - logger.warn("Failed to save to cache", { key, error }); - } + try { + await cacheRedis.set(key, JSON.stringify(entry)); + } catch (error) { + logger.warn("Failed to save to cache", { key, error }); + } } -export async function getEntryFromCache(key: string): Promise { - if (!cacheRedis) return null; +export async function getEntryFromCache( + key: string +): Promise { + if (!cacheRedis) return null; - try { - return JSON.parse(await cacheRedis.get(key) ?? "null"); - } catch (error) { - logger.warn("Failed to get from cache", { key, error }); - return null; - } + try { + return JSON.parse((await cacheRedis.get(key)) ?? "null"); + } catch (error) { + logger.warn("Failed to get from cache", { key, error }); + return null; + } } diff --git a/apps/api/src/lib/concurrency-limit.ts b/apps/api/src/lib/concurrency-limit.ts index 72dc1e45..aba1fd3a 100644 --- a/apps/api/src/lib/concurrency-limit.ts +++ b/apps/api/src/lib/concurrency-limit.ts @@ -4,45 +4,76 @@ import { RateLimiterMode } from "../types"; import { JobsOptions } from "bullmq"; const constructKey = (team_id: string) => "concurrency-limiter:" + team_id; -const constructQueueKey = (team_id: string) => "concurrency-limit-queue:" + team_id; +const constructQueueKey = (team_id: string) => + "concurrency-limit-queue:" + team_id; const stalledJobTimeoutMs = 2 * 60 * 1000; export function getConcurrencyLimitMax(plan: string): number { - return getRateLimiterPoints(RateLimiterMode.Scrape, undefined, plan); + return getRateLimiterPoints(RateLimiterMode.Scrape, undefined, plan); } -export async function cleanOldConcurrencyLimitEntries(team_id: string, now: number = Date.now()) { - await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now); +export async function cleanOldConcurrencyLimitEntries( + team_id: string, + now: number = Date.now() +) { + await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now); } -export async function getConcurrencyLimitActiveJobs(team_id: string, now: number = Date.now()): Promise { - return await redisConnection.zrangebyscore(constructKey(team_id), now, Infinity); +export async function getConcurrencyLimitActiveJobs( + team_id: string, + now: number = Date.now() +): Promise { + return await redisConnection.zrangebyscore( + constructKey(team_id), + now, + Infinity + ); } -export async function pushConcurrencyLimitActiveJob(team_id: string, id: string, now: number = Date.now()) { - await redisConnection.zadd(constructKey(team_id), now + stalledJobTimeoutMs, id); +export async function pushConcurrencyLimitActiveJob( + team_id: string, + id: string, + now: number = Date.now() +) { + await redisConnection.zadd( + constructKey(team_id), + now + stalledJobTimeoutMs, + id + ); } -export async function removeConcurrencyLimitActiveJob(team_id: string, id: string) { - await redisConnection.zrem(constructKey(team_id), id); +export async function removeConcurrencyLimitActiveJob( + team_id: string, + id: string +) { + await redisConnection.zrem(constructKey(team_id), id); } export type ConcurrencyLimitedJob = { - id: string; - data: any; - opts: JobsOptions; - priority?: number; + id: string; + data: any; + opts: JobsOptions; + priority?: number; +}; + +export async function takeConcurrencyLimitedJob( + team_id: string +): Promise { + const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN"); + if (res === null || res === undefined) { + return null; + } + + return JSON.parse(res[1][0][0]); } -export async function takeConcurrencyLimitedJob(team_id: string): Promise { - const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN"); - if (res === null || res === undefined) { - return null; - } - - return JSON.parse(res[1][0][0]); -} - -export async function pushConcurrencyLimitedJob(team_id: string, job: ConcurrencyLimitedJob) { - await redisConnection.zadd(constructQueueKey(team_id), job.priority ?? 1, JSON.stringify(job)); +export async function pushConcurrencyLimitedJob( + team_id: string, + job: ConcurrencyLimitedJob +) { + await redisConnection.zadd( + constructQueueKey(team_id), + job.priority ?? 1, + JSON.stringify(job) + ); } diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts index eb9c81f1..ef2dabee 100644 --- a/apps/api/src/lib/crawl-redis.test.ts +++ b/apps/api/src/lib/crawl-redis.test.ts @@ -1,33 +1,41 @@ import { generateURLPermutations } from "./crawl-redis"; describe("generateURLPermutations", () => { - it("generates permutations correctly", () => { - const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href); - expect(bareHttps.length).toBe(4); - expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true); - expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true); - expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true); - expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true); + it("generates permutations correctly", () => { + const bareHttps = generateURLPermutations("https://firecrawl.dev").map( + (x) => x.href + ); + expect(bareHttps.length).toBe(4); + expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true); + expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true); - const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href); - expect(bareHttp.length).toBe(4); - expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true); - expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true); - expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true); - expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true); + const bareHttp = generateURLPermutations("http://firecrawl.dev").map( + (x) => x.href + ); + expect(bareHttp.length).toBe(4); + expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true); + expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true); - const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href); - expect(wwwHttps.length).toBe(4); - expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true); - expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true); - expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true); - expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true); + const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map( + (x) => x.href + ); + expect(wwwHttps.length).toBe(4); + expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true); + expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true); - const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href); - expect(wwwHttp.length).toBe(4); - expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true); - expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true); - expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true); - expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true); - }) -}); \ No newline at end of file + const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map( + (x) => x.href + ); + expect(wwwHttp.length).toBe(4); + expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true); + expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true); + }); +}); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 842f6ebf..ab1a238d 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -6,222 +6,331 @@ import { logger as _logger } from "./logger"; import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils"; export type StoredCrawl = { - originUrl?: string; - crawlerOptions: any; - scrapeOptions: Omit; - internalOptions: InternalOptions; - team_id: string; - plan?: string; - robots?: string; - cancelled?: boolean; - createdAt: number; + originUrl?: string; + crawlerOptions: any; + scrapeOptions: Omit; + internalOptions: InternalOptions; + team_id: string; + plan?: string; + robots?: string; + cancelled?: boolean; + createdAt: number; }; export async function saveCrawl(id: string, crawl: StoredCrawl) { - _logger.debug("Saving crawl " + id + " to Redis...", { crawl, module: "crawl-redis", method: "saveCrawl", crawlId: id, teamId: crawl.team_id, plan: crawl.plan }); - await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); - await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX"); + _logger.debug("Saving crawl " + id + " to Redis...", { + crawl, + module: "crawl-redis", + method: "saveCrawl", + crawlId: id, + teamId: crawl.team_id, + plan: crawl.plan + }); + await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); + await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX"); } export async function getCrawl(id: string): Promise { - const x = await redisConnection.get("crawl:" + id); + const x = await redisConnection.get("crawl:" + id); - if (x === null) { - return null; - } + if (x === null) { + return null; + } - return JSON.parse(x); + return JSON.parse(x); } export async function getCrawlExpiry(id: string): Promise { - const d = new Date(); - const ttl = await redisConnection.pttl("crawl:" + id); - d.setMilliseconds(d.getMilliseconds() + ttl); - d.setMilliseconds(0); - return d; + const d = new Date(); + const ttl = await redisConnection.pttl("crawl:" + id); + d.setMilliseconds(d.getMilliseconds() + ttl); + d.setMilliseconds(0); + return d; } export async function addCrawlJob(id: string, job_id: string) { - _logger.debug("Adding crawl job " + job_id + " to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJob", crawlId: id }); - await redisConnection.sadd("crawl:" + id + ":jobs", job_id); - await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); + _logger.debug("Adding crawl job " + job_id + " to Redis...", { + jobId: job_id, + module: "crawl-redis", + method: "addCrawlJob", + crawlId: id + }); + await redisConnection.sadd("crawl:" + id + ":jobs", job_id); + await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); } export async function addCrawlJobs(id: string, job_ids: string[]) { - _logger.debug("Adding crawl jobs to Redis...", { jobIds: job_ids, module: "crawl-redis", method: "addCrawlJobs", crawlId: id }); - await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids); - await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); + _logger.debug("Adding crawl jobs to Redis...", { + jobIds: job_ids, + module: "crawl-redis", + method: "addCrawlJobs", + crawlId: id + }); + await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids); + await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); } -export async function addCrawlJobDone(id: string, job_id: string, success: boolean) { - _logger.debug("Adding done crawl job to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJobDone", crawlId: id }); - await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); - await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX"); +export async function addCrawlJobDone( + id: string, + job_id: string, + success: boolean +) { + _logger.debug("Adding done crawl job to Redis...", { + jobId: job_id, + module: "crawl-redis", + method: "addCrawlJobDone", + crawlId: id + }); + await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); + await redisConnection.expire( + "crawl:" + id + ":jobs_done", + 24 * 60 * 60, + "NX" + ); - if (success) { - await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); - await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX"); - } + if (success) { + await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id); + await redisConnection.expire( + "crawl:" + id + ":jobs_done_ordered", + 24 * 60 * 60, + "NX" + ); + } } export async function getDoneJobsOrderedLength(id: string): Promise { - return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); + return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); } -export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise { - return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end); +export async function getDoneJobsOrdered( + id: string, + start = 0, + end = -1 +): Promise { + return await redisConnection.lrange( + "crawl:" + id + ":jobs_done_ordered", + start, + end + ); } export async function isCrawlFinished(id: string) { - return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs")); + return ( + (await redisConnection.scard("crawl:" + id + ":jobs_done")) === + (await redisConnection.scard("crawl:" + id + ":jobs")) + ); } export async function isCrawlFinishedLocked(id: string) { - return (await redisConnection.exists("crawl:" + id + ":finish")); + return await redisConnection.exists("crawl:" + id + ":finish"); } export async function finishCrawl(id: string) { - if (await isCrawlFinished(id)) { - _logger.debug("Marking crawl as finished.", { module: "crawl-redis", method: "finishCrawl", crawlId: id }); - const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); - if (set === 1) { - await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); - } - return set === 1 - } else { - _logger.debug("Crawl can not be finished yet, not marking as finished.", { module: "crawl-redis", method: "finishCrawl", crawlId: id }); + if (await isCrawlFinished(id)) { + _logger.debug("Marking crawl as finished.", { + module: "crawl-redis", + method: "finishCrawl", + crawlId: id + }); + const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); + if (set === 1) { + await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); } + return set === 1; + } else { + _logger.debug("Crawl can not be finished yet, not marking as finished.", { + module: "crawl-redis", + method: "finishCrawl", + crawlId: id + }); + } } export async function getCrawlJobs(id: string): Promise { - return await redisConnection.smembers("crawl:" + id + ":jobs"); + return await redisConnection.smembers("crawl:" + id + ":jobs"); } export async function getThrottledJobs(teamId: string): Promise { - return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity); + return await redisConnection.zrangebyscore( + "concurrency-limiter:" + teamId + ":throttled", + Date.now(), + Infinity + ); } export function normalizeURL(url: string, sc: StoredCrawl): string { - const urlO = new URL(url); - if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) { - urlO.search = ""; - } - urlO.hash = ""; - return urlO.href; + const urlO = new URL(url); + if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) { + urlO.search = ""; + } + urlO.hash = ""; + return urlO.href; } export function generateURLPermutations(url: string | URL): URL[] { - const urlO = new URL(url); + const urlO = new URL(url); - // Construct two versions, one with www., one without - const urlWithWWW = new URL(urlO); - const urlWithoutWWW = new URL(urlO); - if (urlO.hostname.startsWith("www.")) { - urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4); - } else { - urlWithWWW.hostname = "www." + urlWithoutWWW.hostname; + // Construct two versions, one with www., one without + const urlWithWWW = new URL(urlO); + const urlWithoutWWW = new URL(urlO); + if (urlO.hostname.startsWith("www.")) { + urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4); + } else { + urlWithWWW.hostname = "www." + urlWithoutWWW.hostname; + } + + let permutations = [urlWithWWW, urlWithoutWWW]; + + // Construct more versions for http/https + permutations = permutations.flatMap((urlO) => { + if (!["http:", "https:"].includes(urlO.protocol)) { + return [urlO]; } - let permutations = [urlWithWWW, urlWithoutWWW]; + const urlWithHTTP = new URL(urlO); + const urlWithHTTPS = new URL(urlO); + urlWithHTTP.protocol = "http:"; + urlWithHTTPS.protocol = "https:"; - // Construct more versions for http/https - permutations = permutations.flatMap(urlO => { - if (!["http:", "https:"].includes(urlO.protocol)) { - return [urlO]; - } + return [urlWithHTTP, urlWithHTTPS]; + }); - const urlWithHTTP = new URL(urlO); - const urlWithHTTPS = new URL(urlO); - urlWithHTTP.protocol = "http:"; - urlWithHTTPS.protocol = "https:"; - - return [urlWithHTTP, urlWithHTTPS]; - }); - - return permutations; + return permutations; } -export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise { - let logger = _logger.child({ crawlId: id, module: "crawl-redis", method: "lockURL", preNormalizedURL: url, teamId: sc.team_id, plan: sc.plan }); +export async function lockURL( + id: string, + sc: StoredCrawl, + url: string +): Promise { + let logger = _logger.child({ + crawlId: id, + module: "crawl-redis", + method: "lockURL", + preNormalizedURL: url, + teamId: sc.team_id, + plan: sc.plan + }); - if (typeof sc.crawlerOptions?.limit === "number") { - if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) { - logger.debug("Crawl has already hit visited_unique limit, not locking URL."); - return false; - } + if (typeof sc.crawlerOptions?.limit === "number") { + if ( + (await redisConnection.scard("crawl:" + id + ":visited_unique")) >= + sc.crawlerOptions.limit + ) { + logger.debug( + "Crawl has already hit visited_unique limit, not locking URL." + ); + return false; } + } - url = normalizeURL(url, sc); - logger = logger.child({ url }); + url = normalizeURL(url, sc); + logger = logger.child({ url }); - await redisConnection.sadd("crawl:" + id + ":visited_unique", url); - await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX"); + await redisConnection.sadd("crawl:" + id + ":visited_unique", url); + await redisConnection.expire( + "crawl:" + id + ":visited_unique", + 24 * 60 * 60, + "NX" + ); - let res: boolean; - if (!sc.crawlerOptions?.deduplicateSimilarURLs) { - res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0 - } else { - const permutations = generateURLPermutations(url).map(x => x.href); - // logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations }); - const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations)); - res = x === permutations.length; - } + let res: boolean; + if (!sc.crawlerOptions?.deduplicateSimilarURLs) { + res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0; + } else { + const permutations = generateURLPermutations(url).map((x) => x.href); + // logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations }); + const x = await redisConnection.sadd( + "crawl:" + id + ":visited", + ...permutations + ); + res = x === permutations.length; + } - await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); - logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { res }); - return res; + logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { + res + }); + return res; } /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap -export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise { - urls = urls.map(url => normalizeURL(url, sc)); - const logger = _logger.child({ crawlId: id, module: "crawl-redis", method: "lockURL", teamId: sc.team_id, plan: sc.plan }); +export async function lockURLs( + id: string, + sc: StoredCrawl, + urls: string[] +): Promise { + urls = urls.map((url) => normalizeURL(url, sc)); + const logger = _logger.child({ + crawlId: id, + module: "crawl-redis", + method: "lockURL", + teamId: sc.team_id, + plan: sc.plan + }); - // Add to visited_unique set - logger.debug("Locking " + urls.length + " URLs..."); - await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls); - await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX"); + // Add to visited_unique set + logger.debug("Locking " + urls.length + " URLs..."); + await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls); + await redisConnection.expire( + "crawl:" + id + ":visited_unique", + 24 * 60 * 60, + "NX" + ); - let res: boolean; - if (!sc.crawlerOptions?.deduplicateSimilarURLs) { - const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls); - res = x === urls.length; - } else { - const allPermutations = urls.flatMap(url => generateURLPermutations(url).map(x => x.href)); - logger.debug("Adding " + allPermutations.length + " URL permutations..."); - const x = await redisConnection.sadd("crawl:" + id + ":visited", ...allPermutations); - res = x === allPermutations.length; - } + let res: boolean; + if (!sc.crawlerOptions?.deduplicateSimilarURLs) { + const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls); + res = x === urls.length; + } else { + const allPermutations = urls.flatMap((url) => + generateURLPermutations(url).map((x) => x.href) + ); + logger.debug("Adding " + allPermutations.length + " URL permutations..."); + const x = await redisConnection.sadd( + "crawl:" + id + ":visited", + ...allPermutations + ); + res = x === allPermutations.length; + } - await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); - logger.debug("lockURLs final result: " + res, { res }); - return res; + logger.debug("lockURLs final result: " + res, { res }); + return res; } -export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler { - const crawler = new WebCrawler({ - jobId: id, - initialUrl: sc.originUrl!, - baseUrl: newBase ? new URL(newBase).origin : undefined, - includes: sc.crawlerOptions?.includes ?? [], - excludes: sc.crawlerOptions?.excludes ?? [], - maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, - maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10), - limit: sc.crawlerOptions?.limit ?? 10000, - generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false, - allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false, - allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false, - allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false, - ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false, - }); +export function crawlToCrawler( + id: string, + sc: StoredCrawl, + newBase?: string +): WebCrawler { + const crawler = new WebCrawler({ + jobId: id, + initialUrl: sc.originUrl!, + baseUrl: newBase ? new URL(newBase).origin : undefined, + includes: sc.crawlerOptions?.includes ?? [], + excludes: sc.crawlerOptions?.excludes ?? [], + maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, + maxCrawledDepth: getAdjustedMaxDepth( + sc.originUrl!, + sc.crawlerOptions?.maxDepth ?? 10 + ), + limit: sc.crawlerOptions?.limit ?? 10000, + generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false, + allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false, + allowExternalContentLinks: + sc.crawlerOptions?.allowExternalContentLinks ?? false, + allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false, + ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false + }); - if (sc.robots !== undefined) { - try { - crawler.importRobotsTxt(sc.robots); - } catch (_) {} - } + if (sc.robots !== undefined) { + try { + crawler.importRobotsTxt(sc.robots); + } catch (_) {} + } - return crawler; + return crawler; } diff --git a/apps/api/src/lib/custom-error.ts b/apps/api/src/lib/custom-error.ts index 2ffe52e9..25502a8e 100644 --- a/apps/api/src/lib/custom-error.ts +++ b/apps/api/src/lib/custom-error.ts @@ -8,7 +8,7 @@ export class CustomError extends Error { statusCode: number, status: string, message: string = "", - dataIngestionJob?: any, + dataIngestionJob?: any ) { super(message); this.statusCode = statusCode; @@ -19,4 +19,3 @@ export class CustomError extends Error { Object.setPrototypeOf(this, CustomError.prototype); } } - diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index f70f17c0..ceca176c 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -14,15 +14,15 @@ export const defaultPageOptions = { export const defaultCrawlerOptions = { allowBackwardCrawling: false, limit: 10000 -} +}; export const defaultCrawlPageOptions = { onlyMainContent: false, includeHtml: false, removeTags: [], parsePDF: true -} +}; export const defaultExtractorOptions = { mode: "markdown" -} \ No newline at end of file +}; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 9fa39cff..93911485 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -12,32 +12,40 @@ export interface Progress { currentDocument?: Document; } -export type Action = { - type: "wait", - milliseconds?: number, - selector?: string, -} | { - type: "click", - selector: string, -} | { - type: "screenshot", - fullPage?: boolean, -} | { - type: "write", - text: string, -} | { - type: "press", - key: string, -} | { - type: "scroll", - direction?: "up" | "down", - selector?: string, -} | { - type: "scrape", -} | { - type: "executeJavascript", - script: string, -} +export type Action = + | { + type: "wait"; + milliseconds?: number; + selector?: string; + } + | { + type: "click"; + selector: string; + } + | { + type: "screenshot"; + fullPage?: boolean; + } + | { + type: "write"; + text: string; + } + | { + type: "press"; + key: string; + } + | { + type: "scroll"; + direction?: "up" | "down"; + selector?: string; + } + | { + type: "scrape"; + } + | { + type: "executeJavascript"; + script: string; + }; export type PageOptions = { includeMarkdown?: boolean; @@ -69,11 +77,15 @@ export type PageOptions = { }; export type ExtractorOptions = { - mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html"; + mode: + | "markdown" + | "llm-extraction" + | "llm-extraction-from-markdown" + | "llm-extraction-from-raw-html"; extractionPrompt?: string; extractionSchema?: Record; userPrompt?: string; -} +}; export type SearchOptions = { limit?: number; @@ -97,7 +109,7 @@ export type CrawlerOptions = { mode?: "default" | "fast"; // have a mode of some sort allowBackwardCrawling?: boolean; allowExternalContentLinks?: boolean; -} +}; export type WebScraperOptions = { jobId: string; @@ -137,11 +149,11 @@ export class Document { actions?: { screenshots?: string[]; scrapes?: ScrapeActionContent[]; - } + }; index?: number; linksOnPage?: string[]; // Add this new field as a separate property - + constructor(data: Partial) { if (!data.content) { throw new Error("Missing required fields"); @@ -158,20 +170,19 @@ export class Document { } } - export class SearchResult { url: string; title: string; description: string; constructor(url: string, title: string, description: string) { - this.url = url; - this.title = title; - this.description = description; + this.url = url; + this.title = title; + this.description = description; } toString(): string { - return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; + return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; } } @@ -188,8 +199,7 @@ export interface FireEngineResponse { scrapeActionContent?: ScrapeActionContent[]; } - -export interface FireEngineOptions{ +export interface FireEngineOptions { mobileProxy?: boolean; method?: string; engine?: string; diff --git a/apps/api/src/lib/extract/build-document.ts b/apps/api/src/lib/extract/build-document.ts index 66417a07..79453313 100644 --- a/apps/api/src/lib/extract/build-document.ts +++ b/apps/api/src/lib/extract/build-document.ts @@ -5,9 +5,11 @@ export function buildDocument(document: Document): string { const markdown = document.markdown; // for each key in the metadata allow up to 250 characters - const metadataString = Object.entries(metadata).map(([key, value]) => { - return `${key}: ${value?.toString().slice(0, 250)}`; - }).join('\n'); + const metadataString = Object.entries(metadata) + .map(([key, value]) => { + return `${key}: ${value?.toString().slice(0, 250)}`; + }) + .join("\n"); const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`; const documentString = `${markdown}${documentMetadataString}`; diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index 30aca441..044f71a4 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -1,7 +1,7 @@ import { CohereClient } from "cohere-ai"; import { MapDocument } from "../../controllers/v1/types"; const cohere = new CohereClient({ - token: process.env.COHERE_API_KEY, + token: process.env.COHERE_API_KEY }); export async function rerankDocuments( @@ -15,8 +15,14 @@ export async function rerankDocuments( query, topN, model, - returnDocuments: true, + returnDocuments: true }); - return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore })); + return rerank.results + .sort((a, b) => b.relevanceScore - a.relevanceScore) + .map((x) => ({ + document: x.document, + index: x.index, + relevanceScore: x.relevanceScore + })); } diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 92bcd4cd..7a0020d1 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,16 +1,20 @@ - -import koffi from 'koffi'; -import { join } from 'path'; -import "../services/sentry" +import koffi from "koffi"; +import { join } from "path"; +import "../services/sentry"; import * as Sentry from "@sentry/node"; -import dotenv from 'dotenv'; -import { logger } from './logger'; -import { stat } from 'fs/promises'; +import dotenv from "dotenv"; +import { logger } from "./logger"; +import { stat } from "fs/promises"; dotenv.config(); // TODO: add a timeout to the Go parser -const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so'); +const goExecutablePath = join( + process.cwd(), + "sharedLibs", + "go-html-to-md", + "html-to-markdown.so" +); class GoMarkdownConverter { private static instance: GoMarkdownConverter; @@ -18,7 +22,7 @@ class GoMarkdownConverter { private constructor() { const lib = koffi.load(goExecutablePath); - this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']); + this.convert = lib.func("ConvertHTMLToMarkdown", "string", ["string"]); } public static async getInstance(): Promise { @@ -46,9 +50,11 @@ class GoMarkdownConverter { } } -export async function parseMarkdown(html: string | null | undefined): Promise { +export async function parseMarkdown( + html: string | null | undefined +): Promise { if (!html) { - return ''; + return ""; } try { @@ -62,17 +68,25 @@ export async function parseMarkdown(html: string | null | undefined): Promise - `${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify( - info.metadata, - (_, value) => { - if (value instanceof Error) { - return { - ...value, - name: value.name, - message: value.message, - stack: value.stack, - cause: value.cause, - } - } else { - return value; - } - } - ) : ""}` -) +const logFormat = winston.format.printf( + (info) => + `${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${ + info.level.includes("error") || info.level.includes("warn") + ? JSON.stringify(info.metadata, (_, value) => { + if (value instanceof Error) { + return { + ...value, + name: value.name, + message: value.message, + stack: value.stack, + cause: value.cause + }; + } else { + return value; + } + }) + : "" + }` +); export const logger = winston.createLogger({ level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug", @@ -32,8 +34,8 @@ export const logger = winston.createLogger({ name: value.name, message: value.message, stack: value.stack, - cause: value.cause, - } + cause: value.cause + }; } else { return value; } @@ -43,9 +45,15 @@ export const logger = winston.createLogger({ new winston.transports.Console({ format: winston.format.combine( winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), - winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }), - ...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []), - ), - }), - ], + winston.format.metadata({ + fillExcept: ["message", "level", "timestamp"] + }), + ...((process.env.ENV === "production" && + process.env.SENTRY_ENVIRONMENT === "dev") || + process.env.ENV !== "production" + ? [winston.format.colorize(), logFormat] + : []) + ) + }) + ] }); diff --git a/apps/api/src/lib/parseApi.ts b/apps/api/src/lib/parseApi.ts index 4b03a405..e29135fd 100644 --- a/apps/api/src/lib/parseApi.ts +++ b/apps/api/src/lib/parseApi.ts @@ -13,7 +13,6 @@ export function parseApi(api: string) { return uuid; } - export function uuidToFcUuid(uuid: string) { const uuidWithoutDashes = uuid.replace(/-/g, ""); return `fc-${uuidWithoutDashes}`; diff --git a/apps/api/src/lib/ranker.test.ts b/apps/api/src/lib/ranker.test.ts index 6d17a08b..2b30de19 100644 --- a/apps/api/src/lib/ranker.test.ts +++ b/apps/api/src/lib/ranker.test.ts @@ -1,64 +1,61 @@ -import { performRanking } from './ranker'; +import { performRanking } from "./ranker"; -describe('performRanking', () => { - it('should rank links based on similarity to search query', async () => { +describe("performRanking", () => { + it("should rank links based on similarity to search query", async () => { const linksWithContext = [ - 'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds', - 'url: https://example.com/cats, title: Cat care guide, description: Everything about cats', - 'url: https://example.com/pets, title: General pet care, description: Care for all types of pets' + "url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds", + "url: https://example.com/cats, title: Cat care guide, description: Everything about cats", + "url: https://example.com/pets, title: General pet care, description: Care for all types of pets" ]; const links = [ - 'https://example.com/dogs', - 'https://example.com/cats', - 'https://example.com/pets' + "https://example.com/dogs", + "https://example.com/cats", + "https://example.com/pets" ]; - const searchQuery = 'cats training'; + const searchQuery = "cats training"; const result = await performRanking(linksWithContext, links, searchQuery); // Should return array of objects with link, linkWithContext, score, originalIndex expect(result).toBeInstanceOf(Array); expect(result.length).toBe(3); - + // First result should be the dogs page since query is about dogs - expect(result[0].link).toBe('https://example.com/cats'); - + expect(result[0].link).toBe("https://example.com/cats"); + // Each result should have required properties - result.forEach(item => { - expect(item).toHaveProperty('link'); - expect(item).toHaveProperty('linkWithContext'); - expect(item).toHaveProperty('score'); - expect(item).toHaveProperty('originalIndex'); - expect(typeof item.score).toBe('number'); + result.forEach((item) => { + expect(item).toHaveProperty("link"); + expect(item).toHaveProperty("linkWithContext"); + expect(item).toHaveProperty("score"); + expect(item).toHaveProperty("originalIndex"); + expect(typeof item.score).toBe("number"); expect(item.score).toBeGreaterThanOrEqual(0); expect(item.score).toBeLessThanOrEqual(1); }); // Scores should be in descending order for (let i = 1; i < result.length; i++) { - expect(result[i].score).toBeLessThanOrEqual(result[i-1].score); + expect(result[i].score).toBeLessThanOrEqual(result[i - 1].score); } }); - it('should handle empty inputs', async () => { - const result = await performRanking([], [], ''); + it("should handle empty inputs", async () => { + const result = await performRanking([], [], ""); expect(result).toEqual([]); }); - it('should maintain original order for equal scores', async () => { + it("should maintain original order for equal scores", async () => { const linksWithContext = [ - 'url: https://example.com/1, title: Similar content A, description: test', - 'url: https://example.com/2, title: Similar content B, description: test' + "url: https://example.com/1, title: Similar content A, description: test", + "url: https://example.com/2, title: Similar content B, description: test" ]; - const links = [ - 'https://example.com/1', - 'https://example.com/2' - ]; + const links = ["https://example.com/1", "https://example.com/2"]; - const searchQuery = 'test'; + const searchQuery = "test"; const result = await performRanking(linksWithContext, links, searchQuery); diff --git a/apps/api/src/lib/ranker.ts b/apps/api/src/lib/ranker.ts index e7fa235c..2f06d76d 100644 --- a/apps/api/src/lib/ranker.ts +++ b/apps/api/src/lib/ranker.ts @@ -1,18 +1,18 @@ -import axios from 'axios'; -import { configDotenv } from 'dotenv'; +import axios from "axios"; +import { configDotenv } from "dotenv"; import OpenAI from "openai"; configDotenv(); const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, + apiKey: process.env.OPENAI_API_KEY }); async function getEmbedding(text: string) { const embedding = await openai.embeddings.create({ model: "text-embedding-ada-002", input: text, - encoding_format: "float", + encoding_format: "float" }); return embedding.data[0].embedding; @@ -20,12 +20,8 @@ async function getEmbedding(text: string) { const cosineSimilarity = (vec1: number[], vec2: number[]): number => { const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0); - const magnitude1 = Math.sqrt( - vec1.reduce((sum, val) => sum + val * val, 0) - ); - const magnitude2 = Math.sqrt( - vec2.reduce((sum, val) => sum + val * val, 0) - ); + const magnitude1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0)); + const magnitude2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0)); if (magnitude1 === 0 || magnitude2 === 0) return 0; return dotProduct / (magnitude1 * magnitude2); }; @@ -40,7 +36,11 @@ const textToVector = (searchQuery: string, text: string): number[] => { }); }; -async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) { +async function performRanking( + linksWithContext: string[], + links: string[], + searchQuery: string +) { try { // Handle invalid inputs if (!searchQuery || !linksWithContext.length || !links.length) { @@ -54,27 +54,29 @@ async function performRanking(linksWithContext: string[], links: string[], searc const queryEmbedding = await getEmbedding(sanitizedQuery); // Generate embeddings for each link and calculate similarity - const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => { - try { - const linkEmbedding = await getEmbedding(linkWithContext); - const score = cosineSimilarity(queryEmbedding, linkEmbedding); - - return { - link: links[index], - linkWithContext, - score, - originalIndex: index - }; - } catch (err) { - // If embedding fails for a link, return with score 0 - return { - link: links[index], - linkWithContext, - score: 0, - originalIndex: index - }; - } - })); + const linksAndScores = await Promise.all( + linksWithContext.map(async (linkWithContext, index) => { + try { + const linkEmbedding = await getEmbedding(linkWithContext); + const score = cosineSimilarity(queryEmbedding, linkEmbedding); + + return { + link: links[index], + linkWithContext, + score, + originalIndex: index + }; + } catch (err) { + // If embedding fails for a link, return with score 0 + return { + link: links[index], + linkWithContext, + score: 0, + originalIndex: index + }; + } + }) + ); // Sort links based on similarity scores while preserving original order for equal scores linksAndScores.sort((a, b) => { diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 83873a58..6c39c722 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -6,47 +6,61 @@ import { Engine } from "../scraper/scrapeURL/engines"; configDotenv(); export type ScrapeErrorEvent = { - type: "error", - message: string, - stack?: string, -} + type: "error"; + message: string; + stack?: string; +}; export type ScrapeScrapeEvent = { - type: "scrape", - url: string, - worker?: string, - method: Engine, + type: "scrape"; + url: string; + worker?: string; + method: Engine; result: null | { - success: boolean, - response_code?: number, - response_size?: number, - error?: string | object, + success: boolean; + response_code?: number; + response_size?: number; + error?: string | object; // proxy?: string, - time_taken: number, - }, -} + time_taken: number; + }; +}; export type ScrapeQueueEvent = { - type: "queue", - event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed", - worker?: string, -} + type: "queue"; + event: + | "waiting" + | "active" + | "completed" + | "paused" + | "resumed" + | "removed" + | "failed"; + worker?: string; +}; -export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent; +export type ScrapeEvent = + | ScrapeErrorEvent + | ScrapeScrapeEvent + | ScrapeQueueEvent; export class ScrapeEvents { static async insert(jobId: string, content: ScrapeEvent) { if (jobId === "TEST") return null; - - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (useDbAuthentication) { try { - const result = await supabase.from("scrape_events").insert({ - job_id: jobId, - type: content.type, - content: content, - // created_at - }).select().single(); + const result = await supabase + .from("scrape_events") + .insert({ + job_id: jobId, + type: content.type, + content: content + // created_at + }) + .select() + .single(); return (result.data as any).id; } catch (error) { // logger.error(`Error inserting scrape event: ${error}`); @@ -57,17 +71,25 @@ export class ScrapeEvents { return null; } - static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) { + static async updateScrapeResult( + logId: number | null, + result: ScrapeScrapeEvent["result"] + ) { if (logId === null) return; try { - const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; - await supabase.from("scrape_events").update({ - content: { - ...previousLog.content, - result, - } - }).eq("id", logId); + const previousLog = ( + await supabase.from("scrape_events").select().eq("id", logId).single() + ).data as any; + await supabase + .from("scrape_events") + .update({ + content: { + ...previousLog.content, + result + } + }) + .eq("id", logId); } catch (error) { logger.error(`Error updating scrape result: ${error}`); } @@ -78,7 +100,7 @@ export class ScrapeEvents { await this.insert(((job as any).id ? (job as any).id : job) as string, { type: "queue", event, - worker: process.env.FLY_MACHINE_ID, + worker: process.env.FLY_MACHINE_ID }); } catch (error) { logger.error(`Error logging job event: ${error}`); diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index c9be72a3..2ed7c02a 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -58,7 +58,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { const { data, error } = await supabase_service .from("firecrawl_jobs") .select() - .eq("crawl_id", crawlId) + .eq("crawl_id", crawlId); if (error) { logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`); @@ -73,7 +73,6 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { return data; }; - export const supabaseGetJobByIdOnlyData = async (jobId: string) => { const { data, error } = await supabase_service .from("firecrawl_jobs") @@ -90,4 +89,4 @@ export const supabaseGetJobByIdOnlyData = async (jobId: string) => { } return data; -}; \ No newline at end of file +}; diff --git a/apps/api/src/lib/timeout.ts b/apps/api/src/lib/timeout.ts index 46d34a5a..f913817a 100644 --- a/apps/api/src/lib/timeout.ts +++ b/apps/api/src/lib/timeout.ts @@ -1 +1 @@ -export const axiosTimeout = 5000; \ No newline at end of file +export const axiosTimeout = 5000; diff --git a/apps/api/src/lib/validate-country.ts b/apps/api/src/lib/validate-country.ts index bff1c25c..797ea542 100644 --- a/apps/api/src/lib/validate-country.ts +++ b/apps/api/src/lib/validate-country.ts @@ -6,7 +6,7 @@ export const countries = { continent: "EU", capital: "Andorra la Vella", currency: ["EUR"], - languages: ["ca"], + languages: ["ca"] }, AE: { name: "United Arab Emirates", @@ -15,7 +15,7 @@ export const countries = { continent: "AS", capital: "Abu Dhabi", currency: ["AED"], - languages: ["ar"], + languages: ["ar"] }, AF: { name: "Afghanistan", @@ -24,7 +24,7 @@ export const countries = { continent: "AS", capital: "Kabul", currency: ["AFN"], - languages: ["ps", "uz", "tk"], + languages: ["ps", "uz", "tk"] }, AG: { name: "Antigua and Barbuda", @@ -33,7 +33,7 @@ export const countries = { continent: "NA", capital: "Saint John's", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, AI: { name: "Anguilla", @@ -42,7 +42,7 @@ export const countries = { continent: "NA", capital: "The Valley", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, AL: { name: "Albania", @@ -51,7 +51,7 @@ export const countries = { continent: "EU", capital: "Tirana", currency: ["ALL"], - languages: ["sq"], + languages: ["sq"] }, AM: { name: "Armenia", @@ -60,7 +60,7 @@ export const countries = { continent: "AS", capital: "Yerevan", currency: ["AMD"], - languages: ["hy", "ru"], + languages: ["hy", "ru"] }, AO: { name: "Angola", @@ -69,7 +69,7 @@ export const countries = { continent: "AF", capital: "Luanda", currency: ["AOA"], - languages: ["pt"], + languages: ["pt"] }, AQ: { name: "Antarctica", @@ -78,7 +78,7 @@ export const countries = { continent: "AN", capital: "", currency: [], - languages: [], + languages: [] }, AR: { name: "Argentina", @@ -87,7 +87,7 @@ export const countries = { continent: "SA", capital: "Buenos Aires", currency: ["ARS"], - languages: ["es", "gn"], + languages: ["es", "gn"] }, AS: { name: "American Samoa", @@ -96,7 +96,7 @@ export const countries = { continent: "OC", capital: "Pago Pago", currency: ["USD"], - languages: ["en", "sm"], + languages: ["en", "sm"] }, AT: { name: "Austria", @@ -105,7 +105,7 @@ export const countries = { continent: "EU", capital: "Vienna", currency: ["EUR"], - languages: ["de"], + languages: ["de"] }, AU: { name: "Australia", @@ -114,7 +114,7 @@ export const countries = { continent: "OC", capital: "Canberra", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, AW: { name: "Aruba", @@ -123,7 +123,7 @@ export const countries = { continent: "NA", capital: "Oranjestad", currency: ["AWG"], - languages: ["nl", "pa"], + languages: ["nl", "pa"] }, AX: { name: "Aland", @@ -133,7 +133,7 @@ export const countries = { capital: "Mariehamn", currency: ["EUR"], languages: ["sv"], - partOf: "FI", + partOf: "FI" }, AZ: { name: "Azerbaijan", @@ -143,7 +143,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Baku", currency: ["AZN"], - languages: ["az"], + languages: ["az"] }, BA: { name: "Bosnia and Herzegovina", @@ -152,7 +152,7 @@ export const countries = { continent: "EU", capital: "Sarajevo", currency: ["BAM"], - languages: ["bs", "hr", "sr"], + languages: ["bs", "hr", "sr"] }, BB: { name: "Barbados", @@ -161,7 +161,7 @@ export const countries = { continent: "NA", capital: "Bridgetown", currency: ["BBD"], - languages: ["en"], + languages: ["en"] }, BD: { name: "Bangladesh", @@ -170,7 +170,7 @@ export const countries = { continent: "AS", capital: "Dhaka", currency: ["BDT"], - languages: ["bn"], + languages: ["bn"] }, BE: { name: "Belgium", @@ -179,7 +179,7 @@ export const countries = { continent: "EU", capital: "Brussels", currency: ["EUR"], - languages: ["nl", "fr", "de"], + languages: ["nl", "fr", "de"] }, BF: { name: "Burkina Faso", @@ -188,7 +188,7 @@ export const countries = { continent: "AF", capital: "Ouagadougou", currency: ["XOF"], - languages: ["fr", "ff"], + languages: ["fr", "ff"] }, BG: { name: "Bulgaria", @@ -197,7 +197,7 @@ export const countries = { continent: "EU", capital: "Sofia", currency: ["BGN"], - languages: ["bg"], + languages: ["bg"] }, BH: { name: "Bahrain", @@ -206,7 +206,7 @@ export const countries = { continent: "AS", capital: "Manama", currency: ["BHD"], - languages: ["ar"], + languages: ["ar"] }, BI: { name: "Burundi", @@ -215,7 +215,7 @@ export const countries = { continent: "AF", capital: "Bujumbura", currency: ["BIF"], - languages: ["fr", "rn"], + languages: ["fr", "rn"] }, BJ: { name: "Benin", @@ -224,7 +224,7 @@ export const countries = { continent: "AF", capital: "Porto-Novo", currency: ["XOF"], - languages: ["fr"], + languages: ["fr"] }, BL: { name: "Saint Barthelemy", @@ -233,7 +233,7 @@ export const countries = { continent: "NA", capital: "Gustavia", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, BM: { name: "Bermuda", @@ -242,7 +242,7 @@ export const countries = { continent: "NA", capital: "Hamilton", currency: ["BMD"], - languages: ["en"], + languages: ["en"] }, BN: { name: "Brunei", @@ -251,7 +251,7 @@ export const countries = { continent: "AS", capital: "Bandar Seri Begawan", currency: ["BND"], - languages: ["ms"], + languages: ["ms"] }, BO: { name: "Bolivia", @@ -260,7 +260,7 @@ export const countries = { continent: "SA", capital: "Sucre", currency: ["BOB", "BOV"], - languages: ["es", "ay", "qu"], + languages: ["es", "ay", "qu"] }, BQ: { name: "Bonaire", @@ -269,7 +269,7 @@ export const countries = { continent: "NA", capital: "Kralendijk", currency: ["USD"], - languages: ["nl"], + languages: ["nl"] }, BR: { name: "Brazil", @@ -278,7 +278,7 @@ export const countries = { continent: "SA", capital: "Brasília", currency: ["BRL"], - languages: ["pt"], + languages: ["pt"] }, BS: { name: "Bahamas", @@ -287,7 +287,7 @@ export const countries = { continent: "NA", capital: "Nassau", currency: ["BSD"], - languages: ["en"], + languages: ["en"] }, BT: { name: "Bhutan", @@ -296,7 +296,7 @@ export const countries = { continent: "AS", capital: "Thimphu", currency: ["BTN", "INR"], - languages: ["dz"], + languages: ["dz"] }, BV: { name: "Bouvet Island", @@ -305,7 +305,7 @@ export const countries = { continent: "AN", capital: "", currency: ["NOK"], - languages: ["no", "nb", "nn"], + languages: ["no", "nb", "nn"] }, BW: { name: "Botswana", @@ -314,7 +314,7 @@ export const countries = { continent: "AF", capital: "Gaborone", currency: ["BWP"], - languages: ["en", "tn"], + languages: ["en", "tn"] }, BY: { name: "Belarus", @@ -323,7 +323,7 @@ export const countries = { continent: "EU", capital: "Minsk", currency: ["BYN"], - languages: ["be", "ru"], + languages: ["be", "ru"] }, BZ: { name: "Belize", @@ -332,7 +332,7 @@ export const countries = { continent: "NA", capital: "Belmopan", currency: ["BZD"], - languages: ["en", "es"], + languages: ["en", "es"] }, CA: { name: "Canada", @@ -341,7 +341,7 @@ export const countries = { continent: "NA", capital: "Ottawa", currency: ["CAD"], - languages: ["en", "fr"], + languages: ["en", "fr"] }, CC: { name: "Cocos (Keeling) Islands", @@ -350,7 +350,7 @@ export const countries = { continent: "AS", capital: "West Island", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, CD: { name: "Democratic Republic of the Congo", @@ -359,7 +359,7 @@ export const countries = { continent: "AF", capital: "Kinshasa", currency: ["CDF"], - languages: ["fr", "ln", "kg", "sw", "lu"], + languages: ["fr", "ln", "kg", "sw", "lu"] }, CF: { name: "Central African Republic", @@ -368,7 +368,7 @@ export const countries = { continent: "AF", capital: "Bangui", currency: ["XAF"], - languages: ["fr", "sg"], + languages: ["fr", "sg"] }, CG: { name: "Republic of the Congo", @@ -377,7 +377,7 @@ export const countries = { continent: "AF", capital: "Brazzaville", currency: ["XAF"], - languages: ["fr", "ln"], + languages: ["fr", "ln"] }, CH: { name: "Switzerland", @@ -386,7 +386,7 @@ export const countries = { continent: "EU", capital: "Bern", currency: ["CHE", "CHF", "CHW"], - languages: ["de", "fr", "it"], + languages: ["de", "fr", "it"] }, CI: { name: "Ivory Coast", @@ -395,7 +395,7 @@ export const countries = { continent: "AF", capital: "Yamoussoukro", currency: ["XOF"], - languages: ["fr"], + languages: ["fr"] }, CK: { name: "Cook Islands", @@ -404,7 +404,7 @@ export const countries = { continent: "OC", capital: "Avarua", currency: ["NZD"], - languages: ["en"], + languages: ["en"] }, CL: { name: "Chile", @@ -413,7 +413,7 @@ export const countries = { continent: "SA", capital: "Santiago", currency: ["CLF", "CLP"], - languages: ["es"], + languages: ["es"] }, CM: { name: "Cameroon", @@ -422,7 +422,7 @@ export const countries = { continent: "AF", capital: "Yaoundé", currency: ["XAF"], - languages: ["en", "fr"], + languages: ["en", "fr"] }, CN: { name: "China", @@ -431,7 +431,7 @@ export const countries = { continent: "AS", capital: "Beijing", currency: ["CNY"], - languages: ["zh"], + languages: ["zh"] }, CO: { name: "Colombia", @@ -440,7 +440,7 @@ export const countries = { continent: "SA", capital: "Bogotá", currency: ["COP"], - languages: ["es"], + languages: ["es"] }, CR: { name: "Costa Rica", @@ -449,7 +449,7 @@ export const countries = { continent: "NA", capital: "San José", currency: ["CRC"], - languages: ["es"], + languages: ["es"] }, CU: { name: "Cuba", @@ -458,7 +458,7 @@ export const countries = { continent: "NA", capital: "Havana", currency: ["CUC", "CUP"], - languages: ["es"], + languages: ["es"] }, CV: { name: "Cape Verde", @@ -467,7 +467,7 @@ export const countries = { continent: "AF", capital: "Praia", currency: ["CVE"], - languages: ["pt"], + languages: ["pt"] }, CW: { name: "Curacao", @@ -476,7 +476,7 @@ export const countries = { continent: "NA", capital: "Willemstad", currency: ["ANG"], - languages: ["nl", "pa", "en"], + languages: ["nl", "pa", "en"] }, CX: { name: "Christmas Island", @@ -485,7 +485,7 @@ export const countries = { continent: "AS", capital: "Flying Fish Cove", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, CY: { name: "Cyprus", @@ -494,7 +494,7 @@ export const countries = { continent: "EU", capital: "Nicosia", currency: ["EUR"], - languages: ["el", "tr", "hy"], + languages: ["el", "tr", "hy"] }, CZ: { name: "Czech Republic", @@ -503,7 +503,7 @@ export const countries = { continent: "EU", capital: "Prague", currency: ["CZK"], - languages: ["cs"], + languages: ["cs"] }, DE: { name: "Germany", @@ -512,7 +512,7 @@ export const countries = { continent: "EU", capital: "Berlin", currency: ["EUR"], - languages: ["de"], + languages: ["de"] }, DJ: { name: "Djibouti", @@ -521,7 +521,7 @@ export const countries = { continent: "AF", capital: "Djibouti", currency: ["DJF"], - languages: ["fr", "ar"], + languages: ["fr", "ar"] }, DK: { name: "Denmark", @@ -531,7 +531,7 @@ export const countries = { continents: ["EU", "NA"], capital: "Copenhagen", currency: ["DKK"], - languages: ["da"], + languages: ["da"] }, DM: { name: "Dominica", @@ -540,7 +540,7 @@ export const countries = { continent: "NA", capital: "Roseau", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, DO: { name: "Dominican Republic", @@ -549,7 +549,7 @@ export const countries = { continent: "NA", capital: "Santo Domingo", currency: ["DOP"], - languages: ["es"], + languages: ["es"] }, DZ: { name: "Algeria", @@ -558,7 +558,7 @@ export const countries = { continent: "AF", capital: "Algiers", currency: ["DZD"], - languages: ["ar"], + languages: ["ar"] }, EC: { name: "Ecuador", @@ -567,7 +567,7 @@ export const countries = { continent: "SA", capital: "Quito", currency: ["USD"], - languages: ["es"], + languages: ["es"] }, EE: { name: "Estonia", @@ -576,7 +576,7 @@ export const countries = { continent: "EU", capital: "Tallinn", currency: ["EUR"], - languages: ["et"], + languages: ["et"] }, EG: { name: "Egypt", @@ -586,7 +586,7 @@ export const countries = { continents: ["AF", "AS"], capital: "Cairo", currency: ["EGP"], - languages: ["ar"], + languages: ["ar"] }, EH: { name: "Western Sahara", @@ -595,7 +595,7 @@ export const countries = { continent: "AF", capital: "El Aaiún", currency: ["MAD", "DZD", "MRU"], - languages: ["es"], + languages: ["es"] }, ER: { name: "Eritrea", @@ -604,7 +604,7 @@ export const countries = { continent: "AF", capital: "Asmara", currency: ["ERN"], - languages: ["ti", "ar", "en"], + languages: ["ti", "ar", "en"] }, ES: { name: "Spain", @@ -613,7 +613,7 @@ export const countries = { continent: "EU", capital: "Madrid", currency: ["EUR"], - languages: ["es", "eu", "ca", "gl", "oc"], + languages: ["es", "eu", "ca", "gl", "oc"] }, ET: { name: "Ethiopia", @@ -622,7 +622,7 @@ export const countries = { continent: "AF", capital: "Addis Ababa", currency: ["ETB"], - languages: ["am"], + languages: ["am"] }, FI: { name: "Finland", @@ -631,7 +631,7 @@ export const countries = { continent: "EU", capital: "Helsinki", currency: ["EUR"], - languages: ["fi", "sv"], + languages: ["fi", "sv"] }, FJ: { name: "Fiji", @@ -640,7 +640,7 @@ export const countries = { continent: "OC", capital: "Suva", currency: ["FJD"], - languages: ["en", "fj", "hi", "ur"], + languages: ["en", "fj", "hi", "ur"] }, FK: { name: "Falkland Islands", @@ -649,7 +649,7 @@ export const countries = { continent: "SA", capital: "Stanley", currency: ["FKP"], - languages: ["en"], + languages: ["en"] }, FM: { name: "Micronesia", @@ -658,7 +658,7 @@ export const countries = { continent: "OC", capital: "Palikir", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, FO: { name: "Faroe Islands", @@ -667,7 +667,7 @@ export const countries = { continent: "EU", capital: "Tórshavn", currency: ["DKK"], - languages: ["fo"], + languages: ["fo"] }, FR: { name: "France", @@ -676,7 +676,7 @@ export const countries = { continent: "EU", capital: "Paris", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, GA: { name: "Gabon", @@ -685,7 +685,7 @@ export const countries = { continent: "AF", capital: "Libreville", currency: ["XAF"], - languages: ["fr"], + languages: ["fr"] }, GB: { name: "United Kingdom", @@ -694,7 +694,7 @@ export const countries = { continent: "EU", capital: "London", currency: ["GBP"], - languages: ["en"], + languages: ["en"] }, GD: { name: "Grenada", @@ -703,7 +703,7 @@ export const countries = { continent: "NA", capital: "St. George's", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, GE: { name: "Georgia", @@ -713,7 +713,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Tbilisi", currency: ["GEL"], - languages: ["ka"], + languages: ["ka"] }, GF: { name: "French Guiana", @@ -722,7 +722,7 @@ export const countries = { continent: "SA", capital: "Cayenne", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, GG: { name: "Guernsey", @@ -731,7 +731,7 @@ export const countries = { continent: "EU", capital: "St. Peter Port", currency: ["GBP"], - languages: ["en", "fr"], + languages: ["en", "fr"] }, GH: { name: "Ghana", @@ -740,7 +740,7 @@ export const countries = { continent: "AF", capital: "Accra", currency: ["GHS"], - languages: ["en"], + languages: ["en"] }, GI: { name: "Gibraltar", @@ -749,7 +749,7 @@ export const countries = { continent: "EU", capital: "Gibraltar", currency: ["GIP"], - languages: ["en"], + languages: ["en"] }, GL: { name: "Greenland", @@ -758,7 +758,7 @@ export const countries = { continent: "NA", capital: "Nuuk", currency: ["DKK"], - languages: ["kl"], + languages: ["kl"] }, GM: { name: "Gambia", @@ -767,7 +767,7 @@ export const countries = { continent: "AF", capital: "Banjul", currency: ["GMD"], - languages: ["en"], + languages: ["en"] }, GN: { name: "Guinea", @@ -776,7 +776,7 @@ export const countries = { continent: "AF", capital: "Conakry", currency: ["GNF"], - languages: ["fr", "ff"], + languages: ["fr", "ff"] }, GP: { name: "Guadeloupe", @@ -785,7 +785,7 @@ export const countries = { continent: "NA", capital: "Basse-Terre", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, GQ: { name: "Equatorial Guinea", @@ -794,7 +794,7 @@ export const countries = { continent: "AF", capital: "Malabo", currency: ["XAF"], - languages: ["es", "fr"], + languages: ["es", "fr"] }, GR: { name: "Greece", @@ -803,7 +803,7 @@ export const countries = { continent: "EU", capital: "Athens", currency: ["EUR"], - languages: ["el"], + languages: ["el"] }, GS: { name: "South Georgia and the South Sandwich Islands", @@ -812,7 +812,7 @@ export const countries = { continent: "AN", capital: "King Edward Point", currency: ["GBP"], - languages: ["en"], + languages: ["en"] }, GT: { name: "Guatemala", @@ -821,7 +821,7 @@ export const countries = { continent: "NA", capital: "Guatemala City", currency: ["GTQ"], - languages: ["es"], + languages: ["es"] }, GU: { name: "Guam", @@ -830,7 +830,7 @@ export const countries = { continent: "OC", capital: "Hagåtña", currency: ["USD"], - languages: ["en", "ch", "es"], + languages: ["en", "ch", "es"] }, GW: { name: "Guinea-Bissau", @@ -839,7 +839,7 @@ export const countries = { continent: "AF", capital: "Bissau", currency: ["XOF"], - languages: ["pt"], + languages: ["pt"] }, GY: { name: "Guyana", @@ -848,7 +848,7 @@ export const countries = { continent: "SA", capital: "Georgetown", currency: ["GYD"], - languages: ["en"], + languages: ["en"] }, HK: { name: "Hong Kong", @@ -857,7 +857,7 @@ export const countries = { continent: "AS", capital: "City of Victoria", currency: ["HKD"], - languages: ["zh", "en"], + languages: ["zh", "en"] }, HM: { name: "Heard Island and McDonald Islands", @@ -866,7 +866,7 @@ export const countries = { continent: "AN", capital: "", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, HN: { name: "Honduras", @@ -875,7 +875,7 @@ export const countries = { continent: "NA", capital: "Tegucigalpa", currency: ["HNL"], - languages: ["es"], + languages: ["es"] }, HR: { name: "Croatia", @@ -884,7 +884,7 @@ export const countries = { continent: "EU", capital: "Zagreb", currency: ["EUR"], - languages: ["hr"], + languages: ["hr"] }, HT: { name: "Haiti", @@ -893,7 +893,7 @@ export const countries = { continent: "NA", capital: "Port-au-Prince", currency: ["HTG", "USD"], - languages: ["fr", "ht"], + languages: ["fr", "ht"] }, HU: { name: "Hungary", @@ -902,7 +902,7 @@ export const countries = { continent: "EU", capital: "Budapest", currency: ["HUF"], - languages: ["hu"], + languages: ["hu"] }, ID: { name: "Indonesia", @@ -911,7 +911,7 @@ export const countries = { continent: "AS", capital: "Jakarta", currency: ["IDR"], - languages: ["id"], + languages: ["id"] }, IE: { name: "Ireland", @@ -920,7 +920,7 @@ export const countries = { continent: "EU", capital: "Dublin", currency: ["EUR"], - languages: ["ga", "en"], + languages: ["ga", "en"] }, IL: { name: "Israel", @@ -929,7 +929,7 @@ export const countries = { continent: "AS", capital: "Jerusalem", currency: ["ILS"], - languages: ["he", "ar"], + languages: ["he", "ar"] }, IM: { name: "Isle of Man", @@ -938,7 +938,7 @@ export const countries = { continent: "EU", capital: "Douglas", currency: ["GBP"], - languages: ["en", "gv"], + languages: ["en", "gv"] }, IN: { name: "India", @@ -947,7 +947,7 @@ export const countries = { continent: "AS", capital: "New Delhi", currency: ["INR"], - languages: ["hi", "en"], + languages: ["hi", "en"] }, IO: { name: "British Indian Ocean Territory", @@ -956,7 +956,7 @@ export const countries = { continent: "AS", capital: "Diego Garcia", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, IQ: { name: "Iraq", @@ -965,7 +965,7 @@ export const countries = { continent: "AS", capital: "Baghdad", currency: ["IQD"], - languages: ["ar", "ku"], + languages: ["ar", "ku"] }, IR: { name: "Iran", @@ -974,7 +974,7 @@ export const countries = { continent: "AS", capital: "Tehran", currency: ["IRR"], - languages: ["fa"], + languages: ["fa"] }, IS: { name: "Iceland", @@ -983,7 +983,7 @@ export const countries = { continent: "EU", capital: "Reykjavik", currency: ["ISK"], - languages: ["is"], + languages: ["is"] }, IT: { name: "Italy", @@ -992,7 +992,7 @@ export const countries = { continent: "EU", capital: "Rome", currency: ["EUR"], - languages: ["it"], + languages: ["it"] }, JE: { name: "Jersey", @@ -1001,7 +1001,7 @@ export const countries = { continent: "EU", capital: "Saint Helier", currency: ["GBP"], - languages: ["en", "fr"], + languages: ["en", "fr"] }, JM: { name: "Jamaica", @@ -1010,7 +1010,7 @@ export const countries = { continent: "NA", capital: "Kingston", currency: ["JMD"], - languages: ["en"], + languages: ["en"] }, JO: { name: "Jordan", @@ -1019,7 +1019,7 @@ export const countries = { continent: "AS", capital: "Amman", currency: ["JOD"], - languages: ["ar"], + languages: ["ar"] }, JP: { name: "Japan", @@ -1028,7 +1028,7 @@ export const countries = { continent: "AS", capital: "Tokyo", currency: ["JPY"], - languages: ["ja"], + languages: ["ja"] }, KE: { name: "Kenya", @@ -1037,7 +1037,7 @@ export const countries = { continent: "AF", capital: "Nairobi", currency: ["KES"], - languages: ["en", "sw"], + languages: ["en", "sw"] }, KG: { name: "Kyrgyzstan", @@ -1046,7 +1046,7 @@ export const countries = { continent: "AS", capital: "Bishkek", currency: ["KGS"], - languages: ["ky", "ru"], + languages: ["ky", "ru"] }, KH: { name: "Cambodia", @@ -1055,7 +1055,7 @@ export const countries = { continent: "AS", capital: "Phnom Penh", currency: ["KHR"], - languages: ["km"], + languages: ["km"] }, KI: { name: "Kiribati", @@ -1064,7 +1064,7 @@ export const countries = { continent: "OC", capital: "South Tarawa", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, KM: { name: "Comoros", @@ -1073,7 +1073,7 @@ export const countries = { continent: "AF", capital: "Moroni", currency: ["KMF"], - languages: ["ar", "fr"], + languages: ["ar", "fr"] }, KN: { name: "Saint Kitts and Nevis", @@ -1082,7 +1082,7 @@ export const countries = { continent: "NA", capital: "Basseterre", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, KP: { name: "North Korea", @@ -1091,7 +1091,7 @@ export const countries = { continent: "AS", capital: "Pyongyang", currency: ["KPW"], - languages: ["ko"], + languages: ["ko"] }, KR: { name: "South Korea", @@ -1100,7 +1100,7 @@ export const countries = { continent: "AS", capital: "Seoul", currency: ["KRW"], - languages: ["ko"], + languages: ["ko"] }, KW: { name: "Kuwait", @@ -1109,7 +1109,7 @@ export const countries = { continent: "AS", capital: "Kuwait City", currency: ["KWD"], - languages: ["ar"], + languages: ["ar"] }, KY: { name: "Cayman Islands", @@ -1118,7 +1118,7 @@ export const countries = { continent: "NA", capital: "George Town", currency: ["KYD"], - languages: ["en"], + languages: ["en"] }, KZ: { name: "Kazakhstan", @@ -1128,7 +1128,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Astana", currency: ["KZT"], - languages: ["kk", "ru"], + languages: ["kk", "ru"] }, LA: { name: "Laos", @@ -1137,7 +1137,7 @@ export const countries = { continent: "AS", capital: "Vientiane", currency: ["LAK"], - languages: ["lo"], + languages: ["lo"] }, LB: { name: "Lebanon", @@ -1146,7 +1146,7 @@ export const countries = { continent: "AS", capital: "Beirut", currency: ["LBP"], - languages: ["ar", "fr"], + languages: ["ar", "fr"] }, LC: { name: "Saint Lucia", @@ -1155,7 +1155,7 @@ export const countries = { continent: "NA", capital: "Castries", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, LI: { name: "Liechtenstein", @@ -1164,7 +1164,7 @@ export const countries = { continent: "EU", capital: "Vaduz", currency: ["CHF"], - languages: ["de"], + languages: ["de"] }, LK: { name: "Sri Lanka", @@ -1173,7 +1173,7 @@ export const countries = { continent: "AS", capital: "Colombo", currency: ["LKR"], - languages: ["si", "ta"], + languages: ["si", "ta"] }, LR: { name: "Liberia", @@ -1182,7 +1182,7 @@ export const countries = { continent: "AF", capital: "Monrovia", currency: ["LRD"], - languages: ["en"], + languages: ["en"] }, LS: { name: "Lesotho", @@ -1191,7 +1191,7 @@ export const countries = { continent: "AF", capital: "Maseru", currency: ["LSL", "ZAR"], - languages: ["en", "st"], + languages: ["en", "st"] }, LT: { name: "Lithuania", @@ -1200,7 +1200,7 @@ export const countries = { continent: "EU", capital: "Vilnius", currency: ["EUR"], - languages: ["lt"], + languages: ["lt"] }, LU: { name: "Luxembourg", @@ -1209,7 +1209,7 @@ export const countries = { continent: "EU", capital: "Luxembourg", currency: ["EUR"], - languages: ["fr", "de", "lb"], + languages: ["fr", "de", "lb"] }, LV: { name: "Latvia", @@ -1218,7 +1218,7 @@ export const countries = { continent: "EU", capital: "Riga", currency: ["EUR"], - languages: ["lv"], + languages: ["lv"] }, LY: { name: "Libya", @@ -1227,7 +1227,7 @@ export const countries = { continent: "AF", capital: "Tripoli", currency: ["LYD"], - languages: ["ar"], + languages: ["ar"] }, MA: { name: "Morocco", @@ -1236,7 +1236,7 @@ export const countries = { continent: "AF", capital: "Rabat", currency: ["MAD"], - languages: ["ar"], + languages: ["ar"] }, MC: { name: "Monaco", @@ -1245,7 +1245,7 @@ export const countries = { continent: "EU", capital: "Monaco", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, MD: { name: "Moldova", @@ -1254,7 +1254,7 @@ export const countries = { continent: "EU", capital: "Chișinău", currency: ["MDL"], - languages: ["ro"], + languages: ["ro"] }, ME: { name: "Montenegro", @@ -1263,7 +1263,7 @@ export const countries = { continent: "EU", capital: "Podgorica", currency: ["EUR"], - languages: ["sr", "bs", "sq", "hr"], + languages: ["sr", "bs", "sq", "hr"] }, MF: { name: "Saint Martin", @@ -1272,7 +1272,7 @@ export const countries = { continent: "NA", capital: "Marigot", currency: ["EUR"], - languages: ["en", "fr", "nl"], + languages: ["en", "fr", "nl"] }, MG: { name: "Madagascar", @@ -1281,7 +1281,7 @@ export const countries = { continent: "AF", capital: "Antananarivo", currency: ["MGA"], - languages: ["fr", "mg"], + languages: ["fr", "mg"] }, MH: { name: "Marshall Islands", @@ -1290,7 +1290,7 @@ export const countries = { continent: "OC", capital: "Majuro", currency: ["USD"], - languages: ["en", "mh"], + languages: ["en", "mh"] }, MK: { name: "North Macedonia", @@ -1299,7 +1299,7 @@ export const countries = { continent: "EU", capital: "Skopje", currency: ["MKD"], - languages: ["mk"], + languages: ["mk"] }, ML: { name: "Mali", @@ -1308,7 +1308,7 @@ export const countries = { continent: "AF", capital: "Bamako", currency: ["XOF"], - languages: ["fr"], + languages: ["fr"] }, MM: { name: "Myanmar (Burma)", @@ -1317,7 +1317,7 @@ export const countries = { continent: "AS", capital: "Naypyidaw", currency: ["MMK"], - languages: ["my"], + languages: ["my"] }, MN: { name: "Mongolia", @@ -1326,7 +1326,7 @@ export const countries = { continent: "AS", capital: "Ulan Bator", currency: ["MNT"], - languages: ["mn"], + languages: ["mn"] }, MO: { name: "Macao", @@ -1335,7 +1335,7 @@ export const countries = { continent: "AS", capital: "", currency: ["MOP"], - languages: ["zh", "pt"], + languages: ["zh", "pt"] }, MP: { name: "Northern Mariana Islands", @@ -1344,7 +1344,7 @@ export const countries = { continent: "OC", capital: "Saipan", currency: ["USD"], - languages: ["en", "ch"], + languages: ["en", "ch"] }, MQ: { name: "Martinique", @@ -1353,7 +1353,7 @@ export const countries = { continent: "NA", capital: "Fort-de-France", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, MR: { name: "Mauritania", @@ -1362,7 +1362,7 @@ export const countries = { continent: "AF", capital: "Nouakchott", currency: ["MRU"], - languages: ["ar"], + languages: ["ar"] }, MS: { name: "Montserrat", @@ -1371,7 +1371,7 @@ export const countries = { continent: "NA", capital: "Plymouth", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, MT: { name: "Malta", @@ -1380,7 +1380,7 @@ export const countries = { continent: "EU", capital: "Valletta", currency: ["EUR"], - languages: ["mt", "en"], + languages: ["mt", "en"] }, MU: { name: "Mauritius", @@ -1389,7 +1389,7 @@ export const countries = { continent: "AF", capital: "Port Louis", currency: ["MUR"], - languages: ["en"], + languages: ["en"] }, MV: { name: "Maldives", @@ -1398,7 +1398,7 @@ export const countries = { continent: "AS", capital: "Malé", currency: ["MVR"], - languages: ["dv"], + languages: ["dv"] }, MW: { name: "Malawi", @@ -1407,7 +1407,7 @@ export const countries = { continent: "AF", capital: "Lilongwe", currency: ["MWK"], - languages: ["en", "ny"], + languages: ["en", "ny"] }, MX: { name: "Mexico", @@ -1416,7 +1416,7 @@ export const countries = { continent: "NA", capital: "Mexico City", currency: ["MXN"], - languages: ["es"], + languages: ["es"] }, MY: { name: "Malaysia", @@ -1425,7 +1425,7 @@ export const countries = { continent: "AS", capital: "Kuala Lumpur", currency: ["MYR"], - languages: ["ms"], + languages: ["ms"] }, MZ: { name: "Mozambique", @@ -1434,7 +1434,7 @@ export const countries = { continent: "AF", capital: "Maputo", currency: ["MZN"], - languages: ["pt"], + languages: ["pt"] }, NA: { name: "Namibia", @@ -1443,7 +1443,7 @@ export const countries = { continent: "AF", capital: "Windhoek", currency: ["NAD", "ZAR"], - languages: ["en", "af"], + languages: ["en", "af"] }, NC: { name: "New Caledonia", @@ -1452,7 +1452,7 @@ export const countries = { continent: "OC", capital: "Nouméa", currency: ["XPF"], - languages: ["fr"], + languages: ["fr"] }, NE: { name: "Niger", @@ -1461,7 +1461,7 @@ export const countries = { continent: "AF", capital: "Niamey", currency: ["XOF"], - languages: ["fr"], + languages: ["fr"] }, NF: { name: "Norfolk Island", @@ -1470,7 +1470,7 @@ export const countries = { continent: "OC", capital: "Kingston", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, NG: { name: "Nigeria", @@ -1479,7 +1479,7 @@ export const countries = { continent: "AF", capital: "Abuja", currency: ["NGN"], - languages: ["en"], + languages: ["en"] }, NI: { name: "Nicaragua", @@ -1488,7 +1488,7 @@ export const countries = { continent: "NA", capital: "Managua", currency: ["NIO"], - languages: ["es"], + languages: ["es"] }, NL: { name: "Netherlands", @@ -1497,7 +1497,7 @@ export const countries = { continent: "EU", capital: "Amsterdam", currency: ["EUR"], - languages: ["nl"], + languages: ["nl"] }, NO: { name: "Norway", @@ -1506,7 +1506,7 @@ export const countries = { continent: "EU", capital: "Oslo", currency: ["NOK"], - languages: ["no", "nb", "nn"], + languages: ["no", "nb", "nn"] }, NP: { name: "Nepal", @@ -1515,7 +1515,7 @@ export const countries = { continent: "AS", capital: "Kathmandu", currency: ["NPR"], - languages: ["ne"], + languages: ["ne"] }, NR: { name: "Nauru", @@ -1524,7 +1524,7 @@ export const countries = { continent: "OC", capital: "Yaren", currency: ["AUD"], - languages: ["en", "na"], + languages: ["en", "na"] }, NU: { name: "Niue", @@ -1533,7 +1533,7 @@ export const countries = { continent: "OC", capital: "Alofi", currency: ["NZD"], - languages: ["en"], + languages: ["en"] }, NZ: { name: "New Zealand", @@ -1542,7 +1542,7 @@ export const countries = { continent: "OC", capital: "Wellington", currency: ["NZD"], - languages: ["en", "mi"], + languages: ["en", "mi"] }, OM: { name: "Oman", @@ -1551,7 +1551,7 @@ export const countries = { continent: "AS", capital: "Muscat", currency: ["OMR"], - languages: ["ar"], + languages: ["ar"] }, PA: { name: "Panama", @@ -1560,7 +1560,7 @@ export const countries = { continent: "NA", capital: "Panama City", currency: ["PAB", "USD"], - languages: ["es"], + languages: ["es"] }, PE: { name: "Peru", @@ -1569,7 +1569,7 @@ export const countries = { continent: "SA", capital: "Lima", currency: ["PEN"], - languages: ["es"], + languages: ["es"] }, PF: { name: "French Polynesia", @@ -1578,7 +1578,7 @@ export const countries = { continent: "OC", capital: "Papeetē", currency: ["XPF"], - languages: ["fr"], + languages: ["fr"] }, PG: { name: "Papua New Guinea", @@ -1587,7 +1587,7 @@ export const countries = { continent: "OC", capital: "Port Moresby", currency: ["PGK"], - languages: ["en"], + languages: ["en"] }, PH: { name: "Philippines", @@ -1596,7 +1596,7 @@ export const countries = { continent: "AS", capital: "Manila", currency: ["PHP"], - languages: ["en"], + languages: ["en"] }, PK: { name: "Pakistan", @@ -1605,7 +1605,7 @@ export const countries = { continent: "AS", capital: "Islamabad", currency: ["PKR"], - languages: ["en", "ur"], + languages: ["en", "ur"] }, PL: { name: "Poland", @@ -1614,7 +1614,7 @@ export const countries = { continent: "EU", capital: "Warsaw", currency: ["PLN"], - languages: ["pl"], + languages: ["pl"] }, PM: { name: "Saint Pierre and Miquelon", @@ -1623,7 +1623,7 @@ export const countries = { continent: "NA", capital: "Saint-Pierre", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, PN: { name: "Pitcairn Islands", @@ -1632,7 +1632,7 @@ export const countries = { continent: "OC", capital: "Adamstown", currency: ["NZD"], - languages: ["en"], + languages: ["en"] }, PR: { name: "Puerto Rico", @@ -1641,7 +1641,7 @@ export const countries = { continent: "NA", capital: "San Juan", currency: ["USD"], - languages: ["es", "en"], + languages: ["es", "en"] }, PS: { name: "Palestine", @@ -1650,7 +1650,7 @@ export const countries = { continent: "AS", capital: "Ramallah", currency: ["ILS"], - languages: ["ar"], + languages: ["ar"] }, PT: { name: "Portugal", @@ -1659,7 +1659,7 @@ export const countries = { continent: "EU", capital: "Lisbon", currency: ["EUR"], - languages: ["pt"], + languages: ["pt"] }, PW: { name: "Palau", @@ -1668,7 +1668,7 @@ export const countries = { continent: "OC", capital: "Ngerulmud", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, PY: { name: "Paraguay", @@ -1677,7 +1677,7 @@ export const countries = { continent: "SA", capital: "Asunción", currency: ["PYG"], - languages: ["es", "gn"], + languages: ["es", "gn"] }, QA: { name: "Qatar", @@ -1686,7 +1686,7 @@ export const countries = { continent: "AS", capital: "Doha", currency: ["QAR"], - languages: ["ar"], + languages: ["ar"] }, RE: { name: "Reunion", @@ -1695,7 +1695,7 @@ export const countries = { continent: "AF", capital: "Saint-Denis", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, RO: { name: "Romania", @@ -1704,7 +1704,7 @@ export const countries = { continent: "EU", capital: "Bucharest", currency: ["RON"], - languages: ["ro"], + languages: ["ro"] }, RS: { name: "Serbia", @@ -1713,7 +1713,7 @@ export const countries = { continent: "EU", capital: "Belgrade", currency: ["RSD"], - languages: ["sr"], + languages: ["sr"] }, RU: { name: "Russia", @@ -1723,7 +1723,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Moscow", currency: ["RUB"], - languages: ["ru"], + languages: ["ru"] }, RW: { name: "Rwanda", @@ -1732,7 +1732,7 @@ export const countries = { continent: "AF", capital: "Kigali", currency: ["RWF"], - languages: ["rw", "en", "fr"], + languages: ["rw", "en", "fr"] }, SA: { name: "Saudi Arabia", @@ -1741,7 +1741,7 @@ export const countries = { continent: "AS", capital: "Riyadh", currency: ["SAR"], - languages: ["ar"], + languages: ["ar"] }, SB: { name: "Solomon Islands", @@ -1750,7 +1750,7 @@ export const countries = { continent: "OC", capital: "Honiara", currency: ["SBD"], - languages: ["en"], + languages: ["en"] }, SC: { name: "Seychelles", @@ -1759,7 +1759,7 @@ export const countries = { continent: "AF", capital: "Victoria", currency: ["SCR"], - languages: ["fr", "en"], + languages: ["fr", "en"] }, SD: { name: "Sudan", @@ -1768,7 +1768,7 @@ export const countries = { continent: "AF", capital: "Khartoum", currency: ["SDG"], - languages: ["ar", "en"], + languages: ["ar", "en"] }, SE: { name: "Sweden", @@ -1777,7 +1777,7 @@ export const countries = { continent: "EU", capital: "Stockholm", currency: ["SEK"], - languages: ["sv"], + languages: ["sv"] }, SG: { name: "Singapore", @@ -1786,7 +1786,7 @@ export const countries = { continent: "AS", capital: "Singapore", currency: ["SGD"], - languages: ["en", "ms", "ta", "zh"], + languages: ["en", "ms", "ta", "zh"] }, SH: { name: "Saint Helena", @@ -1795,7 +1795,7 @@ export const countries = { continent: "AF", capital: "Jamestown", currency: ["SHP"], - languages: ["en"], + languages: ["en"] }, SI: { name: "Slovenia", @@ -1804,7 +1804,7 @@ export const countries = { continent: "EU", capital: "Ljubljana", currency: ["EUR"], - languages: ["sl"], + languages: ["sl"] }, SJ: { name: "Svalbard and Jan Mayen", @@ -1813,7 +1813,7 @@ export const countries = { continent: "EU", capital: "Longyearbyen", currency: ["NOK"], - languages: ["no"], + languages: ["no"] }, SK: { name: "Slovakia", @@ -1822,7 +1822,7 @@ export const countries = { continent: "EU", capital: "Bratislava", currency: ["EUR"], - languages: ["sk"], + languages: ["sk"] }, SL: { name: "Sierra Leone", @@ -1831,7 +1831,7 @@ export const countries = { continent: "AF", capital: "Freetown", currency: ["SLL"], - languages: ["en"], + languages: ["en"] }, SM: { name: "San Marino", @@ -1840,7 +1840,7 @@ export const countries = { continent: "EU", capital: "City of San Marino", currency: ["EUR"], - languages: ["it"], + languages: ["it"] }, SN: { name: "Senegal", @@ -1849,7 +1849,7 @@ export const countries = { continent: "AF", capital: "Dakar", currency: ["XOF"], - languages: ["fr"], + languages: ["fr"] }, SO: { name: "Somalia", @@ -1858,7 +1858,7 @@ export const countries = { continent: "AF", capital: "Mogadishu", currency: ["SOS"], - languages: ["so", "ar"], + languages: ["so", "ar"] }, SR: { name: "Suriname", @@ -1867,7 +1867,7 @@ export const countries = { continent: "SA", capital: "Paramaribo", currency: ["SRD"], - languages: ["nl"], + languages: ["nl"] }, SS: { name: "South Sudan", @@ -1876,7 +1876,7 @@ export const countries = { continent: "AF", capital: "Juba", currency: ["SSP"], - languages: ["en"], + languages: ["en"] }, ST: { name: "Sao Tome and Principe", @@ -1885,7 +1885,7 @@ export const countries = { continent: "AF", capital: "São Tomé", currency: ["STN"], - languages: ["pt"], + languages: ["pt"] }, SV: { name: "El Salvador", @@ -1894,7 +1894,7 @@ export const countries = { continent: "NA", capital: "San Salvador", currency: ["SVC", "USD"], - languages: ["es"], + languages: ["es"] }, SX: { name: "Sint Maarten", @@ -1903,7 +1903,7 @@ export const countries = { continent: "NA", capital: "Philipsburg", currency: ["ANG"], - languages: ["nl", "en"], + languages: ["nl", "en"] }, SY: { name: "Syria", @@ -1912,7 +1912,7 @@ export const countries = { continent: "AS", capital: "Damascus", currency: ["SYP"], - languages: ["ar"], + languages: ["ar"] }, SZ: { name: "Eswatini", @@ -1921,7 +1921,7 @@ export const countries = { continent: "AF", capital: "Lobamba", currency: ["SZL"], - languages: ["en", "ss"], + languages: ["en", "ss"] }, TC: { name: "Turks and Caicos Islands", @@ -1930,7 +1930,7 @@ export const countries = { continent: "NA", capital: "Cockburn Town", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, TD: { name: "Chad", @@ -1939,7 +1939,7 @@ export const countries = { continent: "AF", capital: "N'Djamena", currency: ["XAF"], - languages: ["fr", "ar"], + languages: ["fr", "ar"] }, TF: { name: "French Southern Territories", @@ -1948,7 +1948,7 @@ export const countries = { continent: "AN", capital: "Port-aux-Français", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, TG: { name: "Togo", @@ -1957,7 +1957,7 @@ export const countries = { continent: "AF", capital: "Lomé", currency: ["XOF"], - languages: ["fr"], + languages: ["fr"] }, TH: { name: "Thailand", @@ -1966,7 +1966,7 @@ export const countries = { continent: "AS", capital: "Bangkok", currency: ["THB"], - languages: ["th"], + languages: ["th"] }, TJ: { name: "Tajikistan", @@ -1975,7 +1975,7 @@ export const countries = { continent: "AS", capital: "Dushanbe", currency: ["TJS"], - languages: ["tg", "ru"], + languages: ["tg", "ru"] }, TK: { name: "Tokelau", @@ -1984,7 +1984,7 @@ export const countries = { continent: "OC", capital: "Fakaofo", currency: ["NZD"], - languages: ["en"], + languages: ["en"] }, TL: { name: "East Timor", @@ -1993,7 +1993,7 @@ export const countries = { continent: "OC", capital: "Dili", currency: ["USD"], - languages: ["pt"], + languages: ["pt"] }, TM: { name: "Turkmenistan", @@ -2002,7 +2002,7 @@ export const countries = { continent: "AS", capital: "Ashgabat", currency: ["TMT"], - languages: ["tk", "ru"], + languages: ["tk", "ru"] }, TN: { name: "Tunisia", @@ -2011,7 +2011,7 @@ export const countries = { continent: "AF", capital: "Tunis", currency: ["TND"], - languages: ["ar"], + languages: ["ar"] }, TO: { name: "Tonga", @@ -2020,7 +2020,7 @@ export const countries = { continent: "OC", capital: "Nuku'alofa", currency: ["TOP"], - languages: ["en", "to"], + languages: ["en", "to"] }, TR: { name: "Turkey", @@ -2030,7 +2030,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Ankara", currency: ["TRY"], - languages: ["tr"], + languages: ["tr"] }, TT: { name: "Trinidad and Tobago", @@ -2039,7 +2039,7 @@ export const countries = { continent: "NA", capital: "Port of Spain", currency: ["TTD"], - languages: ["en"], + languages: ["en"] }, TV: { name: "Tuvalu", @@ -2048,7 +2048,7 @@ export const countries = { continent: "OC", capital: "Funafuti", currency: ["AUD"], - languages: ["en"], + languages: ["en"] }, TW: { name: "Taiwan", @@ -2057,7 +2057,7 @@ export const countries = { continent: "AS", capital: "Taipei", currency: ["TWD"], - languages: ["zh"], + languages: ["zh"] }, TZ: { name: "Tanzania", @@ -2066,7 +2066,7 @@ export const countries = { continent: "AF", capital: "Dodoma", currency: ["TZS"], - languages: ["sw", "en"], + languages: ["sw", "en"] }, UA: { name: "Ukraine", @@ -2075,7 +2075,7 @@ export const countries = { continent: "EU", capital: "Kyiv", currency: ["UAH"], - languages: ["uk"], + languages: ["uk"] }, UG: { name: "Uganda", @@ -2084,7 +2084,7 @@ export const countries = { continent: "AF", capital: "Kampala", currency: ["UGX"], - languages: ["en", "sw"], + languages: ["en", "sw"] }, UM: { name: "U.S. Minor Outlying Islands", @@ -2093,7 +2093,7 @@ export const countries = { continent: "OC", capital: "", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, US: { name: "United States", @@ -2102,7 +2102,7 @@ export const countries = { continent: "NA", capital: "Washington D.C.", currency: ["USD", "USN", "USS"], - languages: ["en"], + languages: ["en"] }, UY: { name: "Uruguay", @@ -2111,7 +2111,7 @@ export const countries = { continent: "SA", capital: "Montevideo", currency: ["UYI", "UYU"], - languages: ["es"], + languages: ["es"] }, UZ: { name: "Uzbekistan", @@ -2120,7 +2120,7 @@ export const countries = { continent: "AS", capital: "Tashkent", currency: ["UZS"], - languages: ["uz", "ru"], + languages: ["uz", "ru"] }, VA: { name: "Vatican City", @@ -2129,7 +2129,7 @@ export const countries = { continent: "EU", capital: "Vatican City", currency: ["EUR"], - languages: ["it", "la"], + languages: ["it", "la"] }, VC: { name: "Saint Vincent and the Grenadines", @@ -2138,7 +2138,7 @@ export const countries = { continent: "NA", capital: "Kingstown", currency: ["XCD"], - languages: ["en"], + languages: ["en"] }, VE: { name: "Venezuela", @@ -2147,7 +2147,7 @@ export const countries = { continent: "SA", capital: "Caracas", currency: ["VES"], - languages: ["es"], + languages: ["es"] }, VG: { name: "British Virgin Islands", @@ -2156,7 +2156,7 @@ export const countries = { continent: "NA", capital: "Road Town", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, VI: { name: "U.S. Virgin Islands", @@ -2165,7 +2165,7 @@ export const countries = { continent: "NA", capital: "Charlotte Amalie", currency: ["USD"], - languages: ["en"], + languages: ["en"] }, VN: { name: "Vietnam", @@ -2174,7 +2174,7 @@ export const countries = { continent: "AS", capital: "Hanoi", currency: ["VND"], - languages: ["vi"], + languages: ["vi"] }, VU: { name: "Vanuatu", @@ -2183,7 +2183,7 @@ export const countries = { continent: "OC", capital: "Port Vila", currency: ["VUV"], - languages: ["bi", "en", "fr"], + languages: ["bi", "en", "fr"] }, WF: { name: "Wallis and Futuna", @@ -2192,7 +2192,7 @@ export const countries = { continent: "OC", capital: "Mata-Utu", currency: ["XPF"], - languages: ["fr"], + languages: ["fr"] }, WS: { name: "Samoa", @@ -2201,7 +2201,7 @@ export const countries = { continent: "OC", capital: "Apia", currency: ["WST"], - languages: ["sm", "en"], + languages: ["sm", "en"] }, XK: { name: "Kosovo", @@ -2211,7 +2211,7 @@ export const countries = { capital: "Pristina", currency: ["EUR"], languages: ["sq", "sr"], - userAssigned: true, + userAssigned: true }, YE: { name: "Yemen", @@ -2220,7 +2220,7 @@ export const countries = { continent: "AS", capital: "Sana'a", currency: ["YER"], - languages: ["ar"], + languages: ["ar"] }, YT: { name: "Mayotte", @@ -2229,7 +2229,7 @@ export const countries = { continent: "AF", capital: "Mamoudzou", currency: ["EUR"], - languages: ["fr"], + languages: ["fr"] }, ZA: { name: "South Africa", @@ -2238,7 +2238,7 @@ export const countries = { continent: "AF", capital: "Pretoria", currency: ["ZAR"], - languages: ["af", "en", "nr", "st", "ss", "tn", "ts", "ve", "xh", "zu"], + languages: ["af", "en", "nr", "st", "ss", "tn", "ts", "ve", "xh", "zu"] }, ZM: { name: "Zambia", @@ -2247,7 +2247,7 @@ export const countries = { continent: "AF", capital: "Lusaka", currency: ["ZMW"], - languages: ["en"], + languages: ["en"] }, ZW: { name: "Zimbabwe", @@ -2256,6 +2256,6 @@ export const countries = { continent: "AF", capital: "Harare", currency: ["USD", "ZAR", "BWP", "GBP", "AUD", "CNY", "INR", "JPY"], - languages: ["en", "sn", "nd"], - }, + languages: ["en", "sn", "nd"] + } }; diff --git a/apps/api/src/lib/validateUrl.test.ts b/apps/api/src/lib/validateUrl.test.ts index eec39f97..81c150fb 100644 --- a/apps/api/src/lib/validateUrl.test.ts +++ b/apps/api/src/lib/validateUrl.test.ts @@ -18,7 +18,10 @@ describe("isSameDomain", () => { }); it("should return true for a subdomain with different protocols", () => { - const result = isSameDomain("https://sub.example.com", "http://example.com"); + const result = isSameDomain( + "https://sub.example.com", + "http://example.com" + ); expect(result).toBe(true); }); @@ -30,32 +33,44 @@ describe("isSameDomain", () => { }); it("should return true for a subdomain with www prefix", () => { - const result = isSameDomain("http://www.sub.example.com", "http://example.com"); + const result = isSameDomain( + "http://www.sub.example.com", + "http://example.com" + ); expect(result).toBe(true); }); it("should return true for the same domain with www prefix", () => { - const result = isSameDomain("http://docs.s.s.example.com", "http://example.com"); + const result = isSameDomain( + "http://docs.s.s.example.com", + "http://example.com" + ); expect(result).toBe(true); }); }); - - - describe("isSameSubdomain", () => { it("should return false for a subdomain", () => { - const result = isSameSubdomain("http://example.com", "http://docs.example.com"); + const result = isSameSubdomain( + "http://example.com", + "http://docs.example.com" + ); expect(result).toBe(false); }); it("should return true for the same subdomain", () => { - const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com"); + const result = isSameSubdomain( + "http://docs.example.com", + "http://docs.example.com" + ); expect(result).toBe(true); }); it("should return false for different subdomains", () => { - const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com"); + const result = isSameSubdomain( + "http://docs.example.com", + "http://blog.example.com" + ); expect(result).toBe(false); }); @@ -72,17 +87,26 @@ describe("isSameSubdomain", () => { }); it("should return true for the same subdomain with different protocols", () => { - const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com"); + const result = isSameSubdomain( + "https://docs.example.com", + "http://docs.example.com" + ); expect(result).toBe(true); }); it("should return true for the same subdomain with www prefix", () => { - const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com"); + const result = isSameSubdomain( + "http://www.docs.example.com", + "http://docs.example.com" + ); expect(result).toBe(true); }); it("should return false for a subdomain with www prefix and different subdomain", () => { - const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com"); + const result = isSameSubdomain( + "http://www.docs.example.com", + "http://blog.example.com" + ); expect(result).toBe(false); }); }); @@ -116,19 +140,13 @@ describe("removeDuplicateUrls", () => { }); it("should prefer https over http", () => { - const urls = [ - "http://example.com", - "https://example.com" - ]; + const urls = ["http://example.com", "https://example.com"]; const result = removeDuplicateUrls(urls); expect(result).toEqual(["https://example.com"]); }); it("should prefer non-www over www", () => { - const urls = [ - "https://www.example.com", - "https://example.com" - ]; + const urls = ["https://www.example.com", "https://example.com"]; const result = removeDuplicateUrls(urls); expect(result).toEqual(["https://example.com"]); }); @@ -140,19 +158,13 @@ describe("removeDuplicateUrls", () => { }); it("should handle URLs with different cases", () => { - const urls = [ - "https://EXAMPLE.com", - "https://example.com" - ]; + const urls = ["https://EXAMPLE.com", "https://example.com"]; const result = removeDuplicateUrls(urls); expect(result).toEqual(["https://EXAMPLE.com"]); }); it("should handle URLs with trailing slashes", () => { - const urls = [ - "https://example.com", - "https://example.com/" - ]; + const urls = ["https://example.com", "https://example.com/"]; const result = removeDuplicateUrls(urls); expect(result).toEqual(["https://example.com"]); }); diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts index 14a74de8..dc27c136 100644 --- a/apps/api/src/lib/validateUrl.ts +++ b/apps/api/src/lib/validateUrl.ts @@ -58,9 +58,9 @@ export const checkUrl = (url: string) => { * Same domain check * It checks if the domain of the url is the same as the base url * It accounts true for subdomains and www.subdomains - * @param url - * @param baseUrl - * @returns + * @param url + * @param baseUrl + * @returns */ export function isSameDomain(url: string, baseUrl: string) { const { urlObj: urlObj1, error: error1 } = getURLobj(url); @@ -74,16 +74,21 @@ export function isSameDomain(url: string, baseUrl: string) { const typedUrlObj2 = urlObj2 as URL; const cleanHostname = (hostname: string) => { - return hostname.startsWith('www.') ? hostname.slice(4) : hostname; + return hostname.startsWith("www.") ? hostname.slice(4) : hostname; }; - const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.'); - const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.'); + const domain1 = cleanHostname(typedUrlObj1.hostname) + .split(".") + .slice(-2) + .join("."); + const domain2 = cleanHostname(typedUrlObj2.hostname) + .split(".") + .slice(-2) + .join("."); return domain1 === domain2; } - export function isSameSubdomain(url: string, baseUrl: string) { const { urlObj: urlObj1, error: error1 } = getURLobj(url); const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl); @@ -96,20 +101,31 @@ export function isSameSubdomain(url: string, baseUrl: string) { const typedUrlObj2 = urlObj2 as URL; const cleanHostname = (hostname: string) => { - return hostname.startsWith('www.') ? hostname.slice(4) : hostname; + return hostname.startsWith("www.") ? hostname.slice(4) : hostname; }; - const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.'); - const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.'); + const domain1 = cleanHostname(typedUrlObj1.hostname) + .split(".") + .slice(-2) + .join("."); + const domain2 = cleanHostname(typedUrlObj2.hostname) + .split(".") + .slice(-2) + .join("."); - const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.'); - const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.'); + const subdomain1 = cleanHostname(typedUrlObj1.hostname) + .split(".") + .slice(0, -2) + .join("."); + const subdomain2 = cleanHostname(typedUrlObj2.hostname) + .split(".") + .slice(0, -2) + .join("."); // Check if the domains are the same and the subdomains are the same return domain1 === domain2 && subdomain1 === subdomain2; } - export const checkAndUpdateURLForMap = (url: string) => { if (!protocolIncluded(url)) { url = `http://${url}`; @@ -119,7 +135,6 @@ export const checkAndUpdateURLForMap = (url: string) => { url = url.slice(0, -1); } - const { error, urlObj } = getURLobj(url); if (error) { throw new Error("Invalid URL"); @@ -137,34 +152,34 @@ export const checkAndUpdateURLForMap = (url: string) => { return { urlObj: typedUrlObj, url: url }; }; - - - - export function removeDuplicateUrls(urls: string[]): string[] { const urlMap = new Map(); for (const url of urls) { const parsedUrl = new URL(url); const protocol = parsedUrl.protocol; - const hostname = parsedUrl.hostname.replace(/^www\./, ''); + const hostname = parsedUrl.hostname.replace(/^www\./, ""); const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash; - + const key = `${hostname}${path}`; - + if (!urlMap.has(key)) { urlMap.set(key, url); } else { const existingUrl = new URL(urlMap.get(key)!); const existingProtocol = existingUrl.protocol; - - if (protocol === 'https:' && existingProtocol === 'http:') { + + if (protocol === "https:" && existingProtocol === "http:") { urlMap.set(key, url); - } else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) { + } else if ( + protocol === existingProtocol && + !parsedUrl.hostname.startsWith("www.") && + existingUrl.hostname.startsWith("www.") + ) { urlMap.set(key, url); } } } return [...new Set(Array.from(urlMap.values()))]; -} \ No newline at end of file +} diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index a6cd539d..ab3f4d4b 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -8,10 +8,10 @@ let warningCount = 0; export function withAuth( originalFunction: (...args: U) => Promise, - mockSuccess: T, + mockSuccess: T ) { return async function (...args: U): Promise { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (!useDbAuthentication) { if (warningCount < 5) { logger.warn("You're bypassing authentication"); diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 90d4a47f..981189ab 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -2,7 +2,7 @@ import { Job } from "bullmq"; import { WebScraperOptions, RunWebScraperParams, - RunWebScraperResult, + RunWebScraperResult } from "../types"; import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../controllers/v1/types"; @@ -10,25 +10,31 @@ import { supabase_service } from "../services/supabase"; import { logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; import { configDotenv } from "dotenv"; -import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL"; +import { + EngineResultsTracker, + scrapeURL, + ScrapeUrlResponse +} from "../scraper/scrapeURL"; import { Engine } from "../scraper/scrapeURL/engines"; configDotenv(); export async function startWebScraperPipeline({ job, - token, + token }: { job: Job & { id: string }; token: string; }) { - return (await runWebScraper({ + return await runWebScraper({ url: job.data.url, mode: job.data.mode, scrapeOptions: { ...job.data.scrapeOptions, - ...(job.data.crawl_id ? ({ - formats: job.data.scrapeOptions.formats.concat(["rawHtml"]), - }): {}), + ...(job.data.crawl_id + ? { + formats: job.data.scrapeOptions.formats.concat(["rawHtml"]) + } + : {}) }, internalOptions: job.data.internalOptions, // onSuccess: (result, mode) => { @@ -42,8 +48,8 @@ export async function startWebScraperPipeline({ team_id: job.data.team_id, bull_job_id: job.id.toString(), priority: job.opts.priority, - is_scrape: job.data.is_scrape ?? false, - })); + is_scrape: job.data.is_scrape ?? false + }); } export async function runWebScraper({ @@ -56,28 +62,40 @@ export async function runWebScraper({ team_id, bull_job_id, priority, - is_scrape=false, + is_scrape = false }: RunWebScraperParams): Promise { let response: ScrapeUrlResponse | undefined = undefined; let engines: EngineResultsTracker = {}; try { - response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions }); + response = await scrapeURL(bull_job_id, url, scrapeOptions, { + priority, + ...internalOptions + }); if (!response.success) { if (response.error instanceof Error) { throw response.error; } else { - throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error)); + throw new Error( + "scrapeURL error: " + + (Array.isArray(response.error) + ? JSON.stringify(response.error) + : typeof response.error === "object" + ? JSON.stringify({ ...response.error }) + : response.error) + ); } } - if(is_scrape === false) { + if (is_scrape === false) { let creditsToBeBilled = 1; // Assuming 1 credit per document if (scrapeOptions.extract) { creditsToBeBilled = 5; } - billTeam(team_id, undefined, creditsToBeBilled).catch(error => { - logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); + billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}` + ); // Optionally, you could notify an admin or add to a retry queue here }); } @@ -88,42 +106,70 @@ export async function runWebScraper({ engines = response.engines; return response; } catch (error) { - engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {})); + engines = + response !== undefined + ? response.engines + : typeof error === "object" && error !== null + ? ((error as any).results ?? {}) + : {}; if (response !== undefined) { return { ...response, success: false, - error, - } + error + }; } else { - return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines }; + return { + success: false, + error, + logs: ["no logs -- error coming from runWebScraper"], + engines + }; } // onError(error); } finally { - const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[]; + const engineOrder = Object.entries(engines) + .sort((a, b) => a[1].startedAt - b[1].startedAt) + .map((x) => x[0]) as Engine[]; for (const engine of engineOrder) { - const result = engines[engine] as Exclude; + const result = engines[engine] as Exclude< + EngineResultsTracker[Engine], + undefined + >; ScrapeEvents.insert(bull_job_id, { type: "scrape", url, method: engine, result: { success: result.state === "success", - response_code: (result.state === "success" ? result.result.statusCode : undefined), - response_size: (result.state === "success" ? result.result.html.length : undefined), - error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined), - time_taken: result.finishedAt - result.startedAt, - }, + response_code: + result.state === "success" ? result.result.statusCode : undefined, + response_size: + result.state === "success" ? result.result.html.length : undefined, + error: + result.state === "error" + ? result.error + : result.state === "timeout" + ? "Timed out" + : undefined, + time_taken: result.finishedAt - result.startedAt + } }); } } } -const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => { +const saveJob = async ( + job: Job, + result: any, + token: string, + mode: string, + engines?: EngineResultsTracker +) => { try { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (useDbAuthentication) { const { data, error } = await supabase_service .from("firecrawl_jobs") @@ -140,12 +186,12 @@ const saveJob = async (job: Job, result: any, token: string, mode: string, engin // } catch (error) { // // I think the job won't exist here anymore // } - // } else { - // try { - // await job.moveToCompleted(result, token, false); - // } catch (error) { - // // I think the job won't exist here anymore - // } + // } else { + // try { + // await job.moveToCompleted(result, token, false); + // } catch (error) { + // // I think the job won't exist here anymore + // } } ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index ac61519a..861ae9fc 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -4,7 +4,7 @@ import { autoscalerController, checkQueuesController, cleanBefore24hCompleteJobsController, - queuesController, + queuesController } from "../controllers/v0/admin/queue"; import { wrap } from "./v1"; import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; @@ -26,10 +26,7 @@ adminRouter.get( checkQueuesController ); -adminRouter.get( - `/admin/${process.env.BULL_AUTH_KEY}/queues`, - queuesController -); +adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController); adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`, @@ -38,5 +35,5 @@ adminRouter.get( adminRouter.post( `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, - wrap(acucCacheClearController), + wrap(acucCacheClearController) ); diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 2169c2bd..3a7bda65 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -27,4 +27,4 @@ v0Router.post("/v0/search", searchController); // Health/Probe routes v0Router.get("/v0/health/liveness", livenessController); -v0Router.get("/v0/health/readiness", readinessController); \ No newline at end of file +v0Router.get("/v0/health/readiness", readinessController); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 048e1efc..206423ba 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -4,7 +4,12 @@ import { crawlController } from "../controllers/v1/crawl"; import { scrapeController } from "../../src/controllers/v1/scrape"; import { crawlStatusController } from "../controllers/v1/crawl-status"; import { mapController } from "../controllers/v1/map"; -import { ErrorResponse, RequestWithACUC, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types"; +import { + ErrorResponse, + RequestWithACUC, + RequestWithAuth, + RequestWithMaybeAuth +} from "../controllers/v1/types"; import { RateLimiterMode } from "../types"; import { authenticateUser } from "../controllers/auth"; import { createIdempotencyKey } from "../services/idempotency/create"; @@ -27,89 +32,114 @@ import { extractController } from "../controllers/v1/extract"; // import { livenessController } from "../controllers/v1/liveness"; // import { readinessController } from "../controllers/v1/readiness"; -function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void { - return (req, res, next) => { - (async () => { - if (!minimum && req.body) { - minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1; - } - const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1); - if (chunk) { - req.acuc = chunk; - } - if (!success) { - logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); - if (!res.headersSent) { - return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." }); - } - } - req.account = { remainingCredits }; - next(); - })() - .catch(err => next(err)); - }; -} - -export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { - return (req, res, next) => { - (async () => { - const auth = await authenticateUser( - req, - res, - rateLimiterMode, - ); - - if (!auth.success) { - if (!res.headersSent) { - return res.status(auth.status).json({ success: false, error: auth.error }); - } else { - return; - } - } - - const { team_id, plan, chunk } = auth; - - req.auth = { team_id, plan }; - req.acuc = chunk ?? undefined; - if (chunk) { - req.account = { remainingCredits: chunk.remaining_credits }; - } - next(); - })() - .catch(err => next(err)); - } -} - -function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) { +function checkCreditsMiddleware( + minimum?: number +): (req: RequestWithAuth, res: Response, next: NextFunction) => void { + return (req, res, next) => { (async () => { - if (req.headers["x-idempotency-key"]) { - const isIdempotencyValid = await validateIdempotencyKey(req); - if (!isIdempotencyValid) { - if (!res.headersSent) { - return res.status(409).json({ success: false, error: "Idempotency key already used" }); - } - } - createIdempotencyKey(req); + if (!minimum && req.body) { + minimum = + (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1; + } + const { success, remainingCredits, chunk } = await checkTeamCredits( + req.acuc, + req.auth.team_id, + minimum ?? 1 + ); + if (chunk) { + req.acuc = chunk; + } + if (!success) { + logger.error( + `Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}` + ); + if (!res.headersSent) { + return res + .status(402) + .json({ + success: false, + error: + "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." + }); } - next(); - })() - .catch(err => next(err)); + } + req.account = { remainingCredits }; + next(); + })().catch((err) => next(err)); + }; +} + +export function authMiddleware( + rateLimiterMode: RateLimiterMode +): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { + return (req, res, next) => { + (async () => { + const auth = await authenticateUser(req, res, rateLimiterMode); + + if (!auth.success) { + if (!res.headersSent) { + return res + .status(auth.status) + .json({ success: false, error: auth.error }); + } else { + return; + } + } + + const { team_id, plan, chunk } = auth; + + req.auth = { team_id, plan }; + req.acuc = chunk ?? undefined; + if (chunk) { + req.account = { remainingCredits: chunk.remaining_credits }; + } + next(); + })().catch((err) => next(err)); + }; +} + +function idempotencyMiddleware( + req: Request, + res: Response, + next: NextFunction +) { + (async () => { + if (req.headers["x-idempotency-key"]) { + const isIdempotencyValid = await validateIdempotencyKey(req); + if (!isIdempotencyValid) { + if (!res.headersSent) { + return res + .status(409) + .json({ success: false, error: "Idempotency key already used" }); + } + } + createIdempotencyKey(req); + } + next(); + })().catch((err) => next(err)); } function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { - if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { - if (!res.headersSent) { - return res.status(403).json({ success: false, error: "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." }); - } + if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { + if (!res.headersSent) { + return res + .status(403) + .json({ + success: false, + error: + "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." + }); } - next(); + } + next(); } -export function wrap(controller: (req: Request, res: Response) => Promise): (req: Request, res: Response, next: NextFunction) => any { - return (req, res, next) => { - controller(req, res) - .catch(err => next(err)) - } +export function wrap( + controller: (req: Request, res: Response) => Promise +): (req: Request, res: Response, next: NextFunction) => any { + return (req, res, next) => { + controller(req, res).catch((err) => next(err)); + }; } expressWs(express()); @@ -117,80 +147,71 @@ expressWs(express()); export const v1Router = express.Router(); v1Router.post( - "/scrape", - authMiddleware(RateLimiterMode.Scrape), - checkCreditsMiddleware(1), - blocklistMiddleware, - wrap(scrapeController) + "/scrape", + authMiddleware(RateLimiterMode.Scrape), + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(scrapeController) ); v1Router.post( - "/crawl", - authMiddleware(RateLimiterMode.Crawl), - checkCreditsMiddleware(), - blocklistMiddleware, - idempotencyMiddleware, - wrap(crawlController) + "/crawl", + authMiddleware(RateLimiterMode.Crawl), + checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, + wrap(crawlController) ); v1Router.post( - "/batch/scrape", - authMiddleware(RateLimiterMode.Crawl), - checkCreditsMiddleware(), - blocklistMiddleware, - idempotencyMiddleware, - wrap(batchScrapeController) + "/batch/scrape", + authMiddleware(RateLimiterMode.Crawl), + checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, + wrap(batchScrapeController) ); v1Router.post( - "/map", - authMiddleware(RateLimiterMode.Map), - checkCreditsMiddleware(1), - blocklistMiddleware, - wrap(mapController) + "/map", + authMiddleware(RateLimiterMode.Map), + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(mapController) ); v1Router.get( - "/crawl/:jobId", - authMiddleware(RateLimiterMode.CrawlStatus), - wrap(crawlStatusController) + "/crawl/:jobId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(crawlStatusController) ); v1Router.get( - "/batch/scrape/:jobId", - authMiddleware(RateLimiterMode.CrawlStatus), - // Yes, it uses the same controller as the normal crawl status controller - wrap((req:any, res):any => crawlStatusController(req, res, true)) + "/batch/scrape/:jobId", + authMiddleware(RateLimiterMode.CrawlStatus), + // Yes, it uses the same controller as the normal crawl status controller + wrap((req: any, res): any => crawlStatusController(req, res, true)) ); +v1Router.get("/scrape/:jobId", wrap(scrapeStatusController)); + v1Router.get( - "/scrape/:jobId", - wrap(scrapeStatusController) + "/concurrency-check", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(concurrencyCheckController) ); -v1Router.get( - "/concurrency-check", - authMiddleware(RateLimiterMode.CrawlStatus), - wrap(concurrencyCheckController) -); - -v1Router.ws( - "/crawl/:jobId", - crawlStatusWSController -); +v1Router.ws("/crawl/:jobId", crawlStatusWSController); v1Router.post( - "/extract", - authMiddleware(RateLimiterMode.Scrape), - checkCreditsMiddleware(1), - wrap(extractController) + "/extract", + authMiddleware(RateLimiterMode.Scrape), + checkCreditsMiddleware(1), + wrap(extractController) ); - - // v1Router.post("/crawlWebsitePreview", crawlPreviewController); - v1Router.delete( "/crawl/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), @@ -207,4 +228,3 @@ v1Router.delete( // Health/Probe routes // v1Router.get("/health/liveness", livenessController); // v1Router.get("/health/readiness", readinessController); - diff --git a/apps/api/src/run-req.ts b/apps/api/src/run-req.ts index 6d29916d..61ee61bd 100644 --- a/apps/api/src/run-req.ts +++ b/apps/api/src/run-req.ts @@ -18,19 +18,19 @@ async function sendCrawl(result: Result): Promise { { url: url, crawlerOptions: { - limit: 75, + limit: 75 }, pageOptions: { includeHtml: true, replaceAllPathsWithAbsolutePaths: true, - waitFor: 1000, - }, + waitFor: 1000 + } }, { headers: { "Content-Type": "application/json", - Authorization: `Bearer `, - }, + Authorization: `Bearer ` + } } ); result.idempotency_key = idempotencyKey; @@ -51,8 +51,8 @@ async function getContent(result: Result): Promise { { headers: { "Content-Type": "application/json", - Authorization: `Bearer `, - }, + Authorization: `Bearer ` + } } ); if (response.data.status === "completed") { @@ -95,9 +95,9 @@ async function processResults(results: Result[]): Promise { // Save the result to the file try { // Save job id along with the start_url - const resultWithJobId = results.map(r => ({ + const resultWithJobId = results.map((r) => ({ start_url: r.start_url, - job_id: r.job_id, + job_id: r.job_id })); await fs.writeFile( "results_with_job_id_4000_6000.json", diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index eba0ddb4..da2b7d61 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -1,27 +1,29 @@ // crawler.test.ts -import { WebCrawler } from '../crawler'; -import axios from 'axios'; -import robotsParser from 'robots-parser'; +import { WebCrawler } from "../crawler"; +import axios from "axios"; +import robotsParser from "robots-parser"; -jest.mock('axios'); -jest.mock('robots-parser'); +jest.mock("axios"); +jest.mock("robots-parser"); -describe('WebCrawler', () => { +describe("WebCrawler", () => { let crawler: WebCrawler; const mockAxios = axios as jest.Mocked; - const mockRobotsParser = robotsParser as jest.MockedFunction; + const mockRobotsParser = robotsParser as jest.MockedFunction< + typeof robotsParser + >; let maxCrawledDepth: number; beforeEach(() => { // Setup default mocks mockAxios.get.mockImplementation((url) => { - if (url.includes('robots.txt')) { - return Promise.resolve({ data: 'User-agent: *\nAllow: /' }); - } else if (url.includes('sitemap.xml')) { - return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs + if (url.includes("robots.txt")) { + return Promise.resolve({ data: "User-agent: *\nAllow: /" }); + } else if (url.includes("sitemap.xml")) { + return Promise.resolve({ data: "sitemap content" }); // You would normally parse this to URLs } - return Promise.resolve({ data: '' }); + return Promise.resolve({ data: "" }); }); mockRobotsParser.mockReturnValue({ @@ -30,42 +32,45 @@ describe('WebCrawler', () => { getMatchingLineNumber: jest.fn().mockReturnValue(0), getCrawlDelay: jest.fn().mockReturnValue(0), getSitemaps: jest.fn().mockReturnValue([]), - getPreferredHost: jest.fn().mockReturnValue('example.com') + getPreferredHost: jest.fn().mockReturnValue("example.com") }); }); - it('should respect the limit parameter by not returning more links than specified', async () => { - const initialUrl = 'http://example.com'; - const limit = 2; // Set a limit for the number of links + it("should respect the limit parameter by not returning more links than specified", async () => { + const initialUrl = "http://example.com"; + const limit = 2; // Set a limit for the number of links crawler = new WebCrawler({ jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], - limit: limit, // Apply the limit + limit: limit, // Apply the limit maxCrawledDepth: 10 }); // Mock sitemap fetching function to return more links than the limit - crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ - initialUrl, - initialUrl + '/page1', - initialUrl + '/page2', - initialUrl + '/page3' - ]); + crawler["tryFetchSitemapLinks"] = jest + .fn() + .mockResolvedValue([ + initialUrl, + initialUrl + "/page1", + initialUrl + "/page2", + initialUrl + "/page3" + ]); - const filteredLinks = crawler['filterLinks']( - [initialUrl, initialUrl + '/page1', initialUrl + '/page2', initialUrl + '/page3'], + const filteredLinks = crawler["filterLinks"]( + [ + initialUrl, + initialUrl + "/page1", + initialUrl + "/page2", + initialUrl + "/page3" + ], limit, 10 ); - expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit - expect(filteredLinks).toEqual([ - initialUrl, - initialUrl + '/page1' - ]); + expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit + expect(filteredLinks).toEqual([initialUrl, initialUrl + "/page1"]); }); }); - diff --git a/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts b/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts index 968ed121..662a7376 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts @@ -1,5 +1,5 @@ -import CacheableLookup from 'cacheable-lookup'; -import https from 'node:https'; +import CacheableLookup from "cacheable-lookup"; +import https from "node:https"; import axios from "axios"; describe("DNS", () => { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index cac03a68..be3cdf72 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -40,7 +40,7 @@ export class WebCrawler { allowBackwardCrawling = false, allowExternalContentLinks = false, allowSubdomains = false, - ignoreRobotsTxt = false, + ignoreRobotsTxt = false }: { jobId: string; initialUrl: string; @@ -75,9 +75,14 @@ export class WebCrawler { this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" }); } - public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] { + public filterLinks( + sitemapLinks: string[], + limit: number, + maxDepth: number, + fromMap: boolean = false + ): string[] { // If the initial URL is a sitemap.xml, skip filtering - if (this.initialUrl.endsWith('sitemap.xml') && fromMap) { + if (this.initialUrl.endsWith("sitemap.xml") && fromMap) { return sitemapLinks.slice(0, limit); } @@ -87,14 +92,17 @@ export class WebCrawler { try { url = new URL(link.trim(), this.baseUrl); } catch (error) { - this.logger.debug(`Error processing link: ${link}`, { link, error, method: "filterLinks" }); + this.logger.debug(`Error processing link: ${link}`, { + link, + error, + method: "filterLinks" + }); return false; } const path = url.pathname; - + const depth = getURLDepth(url.toString()); - // Check if the link exceeds the maximum depth allowed if (depth > maxDepth) { return false; @@ -113,9 +121,11 @@ export class WebCrawler { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0 && this.includes[0] !== "") { - if (!this.includes.some((includePattern) => - new RegExp(includePattern).test(path) - )) { + if ( + !this.includes.some((includePattern) => + new RegExp(includePattern).test(path) + ) + ) { return false; } } @@ -128,8 +138,11 @@ export class WebCrawler { } catch (_) { return false; } - const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); - const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); + const initialHostname = normalizedInitialUrl.hostname.replace( + /^www\./, + "" + ); + const linkHostname = normalizedLink.hostname.replace(/^www\./, ""); // Ensure the protocol and hostname match, and the path starts with the initial URL's path // commented to able to handling external link on allowExternalContentLinks @@ -138,15 +151,22 @@ export class WebCrawler { // } if (!this.allowBackwardCrawling) { - if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { + if ( + !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname) + ) { return false; } } - const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true); + const isAllowed = this.ignoreRobotsTxt + ? true + : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true); // Check if the link is disallowed by robots.txt if (!isAllowed) { - this.logger.debug(`Link disallowed by robots.txt: ${link}`, { method: "filterLinks", link }); + this.logger.debug(`Link disallowed by robots.txt: ${link}`, { + method: "filterLinks", + link + }); return false; } @@ -161,12 +181,15 @@ export class WebCrawler { public async getRobotsTxt(skipTlsVerification = false): Promise { let extraArgs = {}; - if(skipTlsVerification) { + if (skipTlsVerification) { extraArgs["httpsAgent"] = new https.Agent({ rejectUnauthorized: false }); } - const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs }); + const response = await axios.get(this.robotsTxtUrl, { + timeout: axiosTimeout, + ...extraArgs + }); return response.data; } @@ -174,15 +197,25 @@ export class WebCrawler { this.robots = robotsParser(this.robotsTxtUrl, txt); } - public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> { - this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { method: "tryGetSitemap" }); + public async tryGetSitemap( + fromMap: boolean = false, + onlySitemap: boolean = false + ): Promise<{ url: string; html: string }[] | null> { + this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { + method: "tryGetSitemap" + }); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - if(fromMap && onlySitemap) { - return sitemapLinks.map(link => ({ url: link, html: "" })); + if (fromMap && onlySitemap) { + return sitemapLinks.map((link) => ({ url: link, html: "" })); } if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap); - return filteredLinks.map(link => ({ url: link, html: "" })); + let filteredLinks = this.filterLinks( + sitemapLinks, + this.limit, + this.maxCrawledDepth, + fromMap + ); + return filteredLinks.map((link) => ({ url: link, html: "" })); } return null; } @@ -204,15 +237,18 @@ export class WebCrawler { } const path = urlObj.pathname; - if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS - if (this.isInternalLink(fullUrl) && + if (this.isInternalLink(fullUrl)) { + // INTERNAL LINKS + if ( + this.isInternalLink(fullUrl) && this.noSections(fullUrl) && !this.matchesExcludes(path) && this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt) ) { return fullUrl; } - } else { // EXTERNAL LINKS + } else { + // EXTERNAL LINKS if ( this.isInternalLink(url) && this.allowExternalContentLinks && @@ -224,7 +260,11 @@ export class WebCrawler { } } - if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) { + if ( + this.allowSubdomains && + !this.isSocialMediaOrEmail(fullUrl) && + this.isSubdomain(fullUrl) + ) { return fullUrl; } @@ -261,14 +301,20 @@ export class WebCrawler { return links; } - private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean { - return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)) + private isRobotsAllowed( + url: string, + ignoreRobotsTxt: boolean = false + ): boolean { + return ignoreRobotsTxt + ? true + : this.robots + ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) + : true; } private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { return this.excludes.some((pattern) => { - if (onlyDomains) - return this.matchesExcludesExternalDomains(url); + if (onlyDomains) return this.matchesExcludesExternalDomains(url); return this.excludes.some((pattern) => new RegExp(pattern).test(url)); }); @@ -282,11 +328,14 @@ export class WebCrawler { const pathname = urlObj.pathname; for (let domain of this.excludes) { - let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, '')); + let domainObj = new URL("http://" + domain.replace(/^https?:\/\//, "")); let domainHostname = domainObj.hostname; let domainPathname = domainObj.pathname; - if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) { + if ( + hostname === domainHostname || + hostname.endsWith(`.${domainHostname}`) + ) { if (pathname.startsWith(domainPathname)) { return true; } @@ -298,8 +347,13 @@ export class WebCrawler { } } - private isExternalMainPage(url:string):boolean { - return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length) + private isExternalMainPage(url: string): boolean { + return !Boolean( + url + .split("/") + .slice(3) + .filter((subArray) => subArray.length > 0).length + ); } private noSections(link: string): boolean { @@ -308,14 +362,19 @@ export class WebCrawler { private isInternalLink(link: string): boolean { const urlObj = new URL(link, this.baseUrl); - const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim(); + const baseDomain = this.baseUrl + .replace(/^https?:\/\//, "") + .replace(/^www\./, "") + .trim(); const linkDomain = urlObj.hostname.replace(/^www\./, "").trim(); - + return linkDomain === baseDomain; } private isSubdomain(link: string): boolean { - return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".")); + return new URL(link, this.baseUrl).hostname.endsWith( + "." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".") + ); } public isFile(url: string): boolean { @@ -329,7 +388,7 @@ export class WebCrawler { ".ico", ".svg", ".tiff", - // ".pdf", + // ".pdf", ".zip", ".exe", ".dmg", @@ -350,10 +409,13 @@ export class WebCrawler { ]; try { - const urlWithoutQuery = url.split('?')[0].toLowerCase(); + const urlWithoutQuery = url.split("?")[0].toLowerCase(); return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); } catch (error) { - this.logger.error(`Error processing URL in isFile`, { method: "isFile", error }); + this.logger.error(`Error processing URL in isFile`, { + method: "isFile", + error + }); return false; } } @@ -369,7 +431,7 @@ export class WebCrawler { "github.com", "calendly.com", "discord.gg", - "discord.com", + "discord.com" ]; return socialMediaOrEmail.some((ext) => url.includes(ext)); } @@ -383,10 +445,7 @@ export class WebCrawler { return url; }; - - const sitemapUrl = url.endsWith(".xml") - ? url - : `${url}/sitemap.xml`; + const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`; let sitemapLinks: string[] = []; @@ -395,12 +454,18 @@ export class WebCrawler { if (response.status === 200) { sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger); } - } catch (error) { - this.logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}`, { method: "tryFetchSitemapLinks", sitemapUrl, error }); + } catch (error) { + this.logger.debug( + `Failed to fetch sitemap with axios from ${sitemapUrl}`, + { method: "tryFetchSitemapLinks", sitemapUrl, error } + ); if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { - const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }, this.logger); + const response = await getLinksFromSitemap( + { sitemapUrl, mode: "fire-engine" }, + this.logger + ); if (response) { sitemapLinks = response; } @@ -410,24 +475,41 @@ export class WebCrawler { if (sitemapLinks.length === 0) { const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { - const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); + const response = await axios.get(baseUrlSitemap, { + timeout: axiosTimeout + }); if (response.status === 200) { - sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }, this.logger); + sitemapLinks = await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, + this.logger + ); } } catch (error) { - this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", sitemapUrl: baseUrlSitemap, error }); + this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { + method: "tryFetchSitemapLinks", + sitemapUrl: baseUrlSitemap, + error + }); if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { - sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }, this.logger); + sitemapLinks = await getLinksFromSitemap( + { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, + this.logger + ); } } } const normalizedUrl = normalizeUrl(url); - const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); + const normalizedSitemapLinks = sitemapLinks.map((link) => + normalizeUrl(link) + ); // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl - if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { + if ( + !normalizedSitemapLinks.includes(normalizedUrl) && + sitemapLinks.length > 0 + ) { sitemapLinks.push(url); } return sitemapLinks; diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index 48aa2ffd..ba77b78b 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -3,9 +3,17 @@ import { logger } from "../../../lib/logger"; export async function handleCustomScraping( text: string, url: string -): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { +): Promise<{ + scraper: string; + url: string; + waitAfterLoad?: number; + pageOptions?: { scrollXPaths?: string[] }; +} | null> { // Check for Readme Docs special case - if (text.includes(' { try { let content: string = ""; try { - if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') { + if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") { const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; - } else if (mode === 'fire-engine') { - const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }); + } else if (mode === "fire-engine") { + const response = await scrapeURL( + "sitemap", + sitemapUrl, + scrapeOptions.parse({ formats: ["rawHtml"] }), + { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true } + ); if (!response.success) { throw response.error; } content = response.document.rawHtml!; } } catch (error) { - logger.error(`Request failed for ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error }); + logger.error(`Request failed for ${sitemapUrl}`, { + method: "getLinksFromSitemap", + mode, + sitemapUrl, + error + }); return allUrls; } @@ -42,26 +52,46 @@ export async function getLinksFromSitemap( if (root && root.sitemap) { const sitemapPromises = root.sitemap - .filter(sitemap => sitemap.loc && sitemap.loc.length > 0) - .map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }, logger)); + .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0) + .map((sitemap) => + getLinksFromSitemap( + { sitemapUrl: sitemap.loc[0], allUrls, mode }, + logger + ) + ); await Promise.all(sitemapPromises); } else if (root && root.url) { const validUrls = root.url - .filter(url => url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) - .map(url => url.loc[0]); + .filter( + (url) => + url.loc && + url.loc.length > 0 && + !WebCrawler.prototype.isFile(url.loc[0]) + ) + .map((url) => url.loc[0]); allUrls.push(...validUrls); } } catch (error) { - logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error }); + logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { + method: "getLinksFromSitemap", + mode, + sitemapUrl, + error + }); } return allUrls; } -export const fetchSitemapData = async (url: string, timeout?: number): Promise => { +export const fetchSitemapData = async ( + url: string, + timeout?: number +): Promise => { const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; try { - const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout }); + const response = await axios.get(sitemapUrl, { + timeout: timeout || axiosTimeout + }); if (response.status === 200) { const xml = response.data; const parsedXml = await parseStringPromise(xml); @@ -71,8 +101,10 @@ export const fetchSitemapData = async (url: string, timeout?: number): Promise { - describe('isUrlBlocked', () => { +describe("Blocklist Functionality", () => { + describe("isUrlBlocked", () => { test.each([ - 'https://facebook.com/fake-test', - 'https://x.com/user-profile', - 'https://twitter.com/home', - 'https://instagram.com/explore', - 'https://linkedin.com/in/johndoe', - 'https://snapchat.com/add/johndoe', - 'https://tiktok.com/@johndoe', - 'https://reddit.com/r/funny', - 'https://tumblr.com/dashboard', - 'https://flickr.com/photos/johndoe', - 'https://whatsapp.com/download', - 'https://wechat.com/features', - 'https://telegram.org/apps' - ])('should return true for blocklisted URL %s', (url) => { + "https://facebook.com/fake-test", + "https://x.com/user-profile", + "https://twitter.com/home", + "https://instagram.com/explore", + "https://linkedin.com/in/johndoe", + "https://snapchat.com/add/johndoe", + "https://tiktok.com/@johndoe", + "https://reddit.com/r/funny", + "https://tumblr.com/dashboard", + "https://flickr.com/photos/johndoe", + "https://whatsapp.com/download", + "https://wechat.com/features", + "https://telegram.org/apps" + ])("should return true for blocklisted URL %s", (url) => { expect(isUrlBlocked(url)).toBe(true); }); test.each([ - 'https://facebook.com/policy', - 'https://twitter.com/tos', - 'https://instagram.com/about/legal/terms', - 'https://linkedin.com/legal/privacy-policy', - 'https://pinterest.com/about/privacy', - 'https://snapchat.com/legal/terms', - 'https://tiktok.com/legal/privacy-policy', - 'https://reddit.com/policies', - 'https://tumblr.com/policy/en/privacy', - 'https://flickr.com/help/terms', - 'https://whatsapp.com/legal', - 'https://wechat.com/en/privacy-policy', - 'https://telegram.org/tos' - ])('should return false for allowed URLs with keywords %s', (url) => { + "https://facebook.com/policy", + "https://twitter.com/tos", + "https://instagram.com/about/legal/terms", + "https://linkedin.com/legal/privacy-policy", + "https://pinterest.com/about/privacy", + "https://snapchat.com/legal/terms", + "https://tiktok.com/legal/privacy-policy", + "https://reddit.com/policies", + "https://tumblr.com/policy/en/privacy", + "https://flickr.com/help/terms", + "https://whatsapp.com/legal", + "https://wechat.com/en/privacy-policy", + "https://telegram.org/tos" + ])("should return false for allowed URLs with keywords %s", (url) => { expect(isUrlBlocked(url)).toBe(false); }); - test('should return false for non-blocklisted domain', () => { - const url = 'https://example.com'; + test("should return false for non-blocklisted domain", () => { + const url = "https://example.com"; expect(isUrlBlocked(url)).toBe(false); }); - test('should handle invalid URLs gracefully', () => { - const url = 'htp://invalid-url'; + test("should handle invalid URLs gracefully", () => { + const url = "htp://invalid-url"; expect(isUrlBlocked(url)).toBe(false); }); }); test.each([ - 'https://subdomain.facebook.com', - 'https://facebook.com.someotherdomain.com', - 'https://www.facebook.com/profile', - 'https://api.twitter.com/info', - 'https://instagram.com/accounts/login' - ])('should return true for URLs with blocklisted domains in subdomains or paths %s', (url) => { + "https://subdomain.facebook.com", + "https://facebook.com.someotherdomain.com", + "https://www.facebook.com/profile", + "https://api.twitter.com/info", + "https://instagram.com/accounts/login" + ])( + "should return true for URLs with blocklisted domains in subdomains or paths %s", + (url) => { + expect(isUrlBlocked(url)).toBe(true); + } + ); + + test.each([ + "https://example.com/facebook.com", + "https://example.com/redirect?url=https://twitter.com", + "https://facebook.com.policy.example.com" + ])( + "should return false for URLs where blocklisted domain is part of another domain or path %s", + (url) => { + expect(isUrlBlocked(url)).toBe(false); + } + ); + + test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])( + "should handle case variations %s", + (url) => { + expect(isUrlBlocked(url)).toBe(true); + } + ); + + test.each([ + "https://facebook.com?redirect=https://example.com", + "https://twitter.com?query=something" + ])("should handle query parameters %s", (url) => { expect(isUrlBlocked(url)).toBe(true); }); - test.each([ - 'https://example.com/facebook.com', - 'https://example.com/redirect?url=https://twitter.com', - 'https://facebook.com.policy.example.com' - ])('should return false for URLs where blocklisted domain is part of another domain or path %s', (url) => { + test("should handle internationalized domain names", () => { + const url = "https://xn--d1acpjx3f.xn--p1ai"; expect(isUrlBlocked(url)).toBe(false); }); - - test.each([ - 'https://FACEBOOK.com', - 'https://INSTAGRAM.com/@something' - ])('should handle case variations %s', (url) => { - expect(isUrlBlocked(url)).toBe(true); - }); - - test.each([ - 'https://facebook.com?redirect=https://example.com', - 'https://twitter.com?query=something' - ])('should handle query parameters %s', (url) => { - expect(isUrlBlocked(url)).toBe(true); - }); - - test('should handle internationalized domain names', () => { - const url = 'https://xn--d1acpjx3f.xn--p1ai'; - expect(isUrlBlocked(url)).toBe(false); - }); -}); \ No newline at end of file +}); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts index 863a6893..4cfa2686 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts @@ -1,47 +1,42 @@ -import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils'; +import { getURLDepth, getAdjustedMaxDepth } from "../maxDepthUtils"; -describe('Testing getURLDepth and getAdjustedMaxDepth', () => { - it('should return 0 for root - mendable.ai', () => { - const enteredURL = "https://www.mendable.ai/" +describe("Testing getURLDepth and getAdjustedMaxDepth", () => { + it("should return 0 for root - mendable.ai", () => { + const enteredURL = "https://www.mendable.ai/"; expect(getURLDepth(enteredURL)).toBe(0); }); - it('should return 0 for root - scrapethissite.com', () => { - const enteredURL = "https://scrapethissite.com/" + it("should return 0 for root - scrapethissite.com", () => { + const enteredURL = "https://scrapethissite.com/"; expect(getURLDepth(enteredURL)).toBe(0); }); - it('should return 1 for scrapethissite.com/pages', () => { - const enteredURL = "https://scrapethissite.com/pages" + it("should return 1 for scrapethissite.com/pages", () => { + const enteredURL = "https://scrapethissite.com/pages"; expect(getURLDepth(enteredURL)).toBe(1); }); - it('should return 2 for scrapethissite.com/pages/articles', () => { - const enteredURL = "https://scrapethissite.com/pages/articles" + it("should return 2 for scrapethissite.com/pages/articles", () => { + const enteredURL = "https://scrapethissite.com/pages/articles"; expect(getURLDepth(enteredURL)).toBe(2); - }); - it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => { - const enteredURL = "https://scrapethissite.com" + it("Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1", () => { + const enteredURL = "https://scrapethissite.com"; expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1); - }); - it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => { - const enteredURL = "https://scrapethissite.com" - expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); - - }); - - it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => { - const enteredURL = "https://mendable.ai" + it("Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0", () => { + const enteredURL = "https://scrapethissite.com"; expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); }); - it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => { - const enteredURL = "https://scrapethissite.com/pages/articles" + it("Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0", () => { + const enteredURL = "https://mendable.ai"; + expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); + }); + + it("Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2", () => { + const enteredURL = "https://scrapethissite.com/pages/articles"; expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4); }); - - }); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index dea4c614..e60943e6 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,68 +1,75 @@ import { logger } from "../../../lib/logger"; const socialMediaBlocklist = [ - 'facebook.com', - 'x.com', - 'twitter.com', - 'instagram.com', - 'linkedin.com', - 'snapchat.com', - 'tiktok.com', - 'reddit.com', - 'tumblr.com', - 'flickr.com', - 'whatsapp.com', - 'wechat.com', - 'telegram.org', - 'researchhub.com', - 'youtube.com', - 'corterix.com', - 'southwest.com', - 'ryanair.com' + "facebook.com", + "x.com", + "twitter.com", + "instagram.com", + "linkedin.com", + "snapchat.com", + "tiktok.com", + "reddit.com", + "tumblr.com", + "flickr.com", + "whatsapp.com", + "wechat.com", + "telegram.org", + "researchhub.com", + "youtube.com", + "corterix.com", + "southwest.com", + "ryanair.com" ]; const allowedKeywords = [ - 'pulse', - 'privacy', - 'terms', - 'policy', - 'user-agreement', - 'legal', - 'help', - 'policies', - 'support', - 'contact', - 'about', - 'careers', - 'blog', - 'press', - 'conditions', - 'tos', - '://library.tiktok.com', - '://ads.tiktok.com', - '://tiktok.com/business', - '://developers.facebook.com' + "pulse", + "privacy", + "terms", + "policy", + "user-agreement", + "legal", + "help", + "policies", + "support", + "contact", + "about", + "careers", + "blog", + "press", + "conditions", + "tos", + "://library.tiktok.com", + "://ads.tiktok.com", + "://tiktok.com/business", + "://developers.facebook.com" ]; export function isUrlBlocked(url: string): boolean { const lowerCaseUrl = url.toLowerCase(); // Check if the URL contains any allowed keywords as whole words - if (allowedKeywords.some(keyword => new RegExp(`\\b${keyword}\\b`, 'i').test(lowerCaseUrl))) { + if ( + allowedKeywords.some((keyword) => + new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl) + ) + ) { return false; } try { - if (!url.startsWith('http://') && !url.startsWith('https://')) { - url = 'https://' + url; + if (!url.startsWith("http://") && !url.startsWith("https://")) { + url = "https://" + url; } - + const urlObj = new URL(url); const hostname = urlObj.hostname.toLowerCase(); // Check if the URL matches any domain in the blocklist - const isBlocked = socialMediaBlocklist.some(domain => { - const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}(\\.|$)`, 'i'); + const isBlocked = socialMediaBlocklist.some((domain) => { + const domainPattern = new RegExp( + `(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`, + "i" + ); return domainPattern.test(hostname); }); diff --git a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts index bcacc210..3db7c5c1 100644 --- a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts +++ b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts @@ -1,12 +1,15 @@ - - -export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number { +export function getAdjustedMaxDepth( + url: string, + maxCrawlDepth: number +): number { const baseURLDepth = getURLDepth(url); const adjustedMaxDepth = maxCrawlDepth + baseURLDepth; return adjustedMaxDepth; } export function getURLDepth(url: string): number { - const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html"); + const pathSplits = new URL(url).pathname + .split("/") + .filter((x) => x !== "" && x !== "index.php" && x !== "index.html"); return pathSplits.length; } diff --git a/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts b/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts index 2845589c..73452c42 100644 --- a/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts +++ b/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts @@ -1,7 +1,5 @@ -export const removeBase64Images = async ( - markdown: string, -) => { +export const removeBase64Images = async (markdown: string) => { const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g; - markdown = markdown.replace(regex, '$1()'); + markdown = markdown.replace(regex, "$1()"); return markdown; }; diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts index 9506be0f..f6ffcb13 100644 --- a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts @@ -4,16 +4,16 @@ import { Meta } from "../.."; import { EngineError } from "../../error"; export async function scrapeCache(meta: Meta): Promise { - const key = cacheKey(meta.url, meta.options, meta.internalOptions); - if (key === null) throw new EngineError("Scrape not eligible for caching"); + const key = cacheKey(meta.url, meta.options, meta.internalOptions); + if (key === null) throw new EngineError("Scrape not eligible for caching"); - const entry = await getEntryFromCache(key); - if (entry === null) throw new EngineError("Cache missed"); + const entry = await getEntryFromCache(key); + if (entry === null) throw new EngineError("Cache missed"); - return { - url: entry.url, - html: entry.html, - statusCode: entry.statusCode, - error: entry.error, - }; -} \ No newline at end of file + return { + url: entry.url, + html: entry.html, + statusCode: entry.statusCode, + error: entry.error + }; +} diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index 9881fae7..02ed0c3f 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -4,12 +4,12 @@ import { downloadFile } from "../utils/downloadFile"; import mammoth from "mammoth"; export async function scrapeDOCX(meta: Meta): Promise { - const { response, tempFilePath } = await downloadFile(meta.id, meta.url); + const { response, tempFilePath } = await downloadFile(meta.id, meta.url); - return { - url: response.url, - statusCode: response.status, + return { + url: response.url, + statusCode: response.status, - html: (await mammoth.convertToHtml({ path: tempFilePath })).value, - } + html: (await mammoth.convertToHtml({ path: tempFilePath })).value + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index 2c809901..92f2d451 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -3,26 +3,34 @@ import { Meta } from "../.."; import { TimeoutError } from "../../error"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; -export async function scrapeURLWithFetch(meta: Meta): Promise { - const timeout = 20000; +export async function scrapeURLWithFetch( + meta: Meta +): Promise { + const timeout = 20000; - const response = await Promise.race([ - fetch(meta.url, { - redirect: "follow", - headers: meta.options.headers, - }), - (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); - throw new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } }); - })() - ]); + const response = await Promise.race([ + fetch(meta.url, { + redirect: "follow", + headers: meta.options.headers + }), + (async () => { + await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); + throw new TimeoutError( + "Fetch was unable to scrape the page before timing out", + { cause: { timeout } } + ); + })() + ]); - specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any)); + specialtyScrapeCheck( + meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), + Object.fromEntries(response.headers as any) + ); - return { - url: response.url, - html: await response.text(), - statusCode: response.status, - // TODO: error? - }; + return { + url: response.url, + html: await response.text(), + statusCode: response.status + // TODO: error? + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 53c19f3c..c3742d26 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -6,105 +6,132 @@ import { robustFetch } from "../../lib/fetch"; import { EngineError, SiteError } from "../../error"; const successSchema = z.object({ - jobId: z.string(), - state: z.literal("completed"), - processing: z.literal(false), + jobId: z.string(), + state: z.literal("completed"), + processing: z.literal(false), - // timeTaken: z.number(), - content: z.string(), - url: z.string().optional(), + // timeTaken: z.number(), + content: z.string(), + url: z.string().optional(), - pageStatusCode: z.number(), - pageError: z.string().optional(), + pageStatusCode: z.number(), + pageError: z.string().optional(), - // TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability - responseHeaders: z.record(z.string(), z.string()).optional(), + // TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability + responseHeaders: z.record(z.string(), z.string()).optional(), - // timeTakenCookie: z.number().optional(), - // timeTakenRequest: z.number().optional(), + // timeTakenCookie: z.number().optional(), + // timeTakenRequest: z.number().optional(), - // legacy: playwright only - screenshot: z.string().optional(), + // legacy: playwright only + screenshot: z.string().optional(), - // new: actions - screenshots: z.string().array().optional(), - actionContent: z.object({ - url: z.string(), - html: z.string(), - }).array().optional(), -}) + // new: actions + screenshots: z.string().array().optional(), + actionContent: z + .object({ + url: z.string(), + html: z.string() + }) + .array() + .optional() +}); export type FireEngineCheckStatusSuccess = z.infer; const processingSchema = z.object({ - jobId: z.string(), - state: z.enum(["delayed", "active", "waiting", "waiting-children", "unknown", "prioritized"]), - processing: z.boolean(), + jobId: z.string(), + state: z.enum([ + "delayed", + "active", + "waiting", + "waiting-children", + "unknown", + "prioritized" + ]), + processing: z.boolean() }); const failedSchema = z.object({ - jobId: z.string(), - state: z.literal("failed"), - processing: z.literal(false), - error: z.string(), + jobId: z.string(), + state: z.literal("failed"), + processing: z.literal(false), + error: z.string() }); export class StillProcessingError extends Error { - constructor(jobId: string) { - super("Job is still under processing", { cause: { jobId } }) - } + constructor(jobId: string) { + super("Job is still under processing", { cause: { jobId } }); + } } -export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise { - const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; +export async function fireEngineCheckStatus( + logger: Logger, + jobId: string +): Promise { + const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; - const status = await Sentry.startSpan({ - name: "fire-engine: Check status", - attributes: { - jobId, + const status = await Sentry.startSpan( + { + name: "fire-engine: Check status", + attributes: { + jobId + } + }, + async (span) => { + return await robustFetch({ + url: `${fireEngineURL}/scrape/${jobId}`, + method: "GET", + logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }), + headers: { + ...(Sentry.isInitialized() + ? { + "sentry-trace": Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span) + } + : {}) } - }, async span => { - return await robustFetch( - { - url: `${fireEngineURL}/scrape/${jobId}`, - method: "GET", - logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }), - headers: { - ...(Sentry.isInitialized() ? ({ - "sentry-trace": Sentry.spanToTraceHeader(span), - "baggage": Sentry.spanToBaggageHeader(span), - }) : {}), - }, - } - ) - }); + }); + } + ); - const successParse = successSchema.safeParse(status); - const processingParse = processingSchema.safeParse(status); - const failedParse = failedSchema.safeParse(status); + const successParse = successSchema.safeParse(status); + const processingParse = processingSchema.safeParse(status); + const failedParse = failedSchema.safeParse(status); - if (successParse.success) { - logger.debug("Scrape succeeded!", { jobId }); - return successParse.data; - } else if (processingParse.success) { - throw new StillProcessingError(jobId); - } else if (failedParse.success) { - logger.debug("Scrape job failed", { status, jobId }); - if (typeof status.error === "string" && status.error.includes("Chrome error: ")) { - throw new SiteError(status.error.split("Chrome error: ")[1]); - } else { - throw new EngineError("Scrape job failed", { - cause: { - status, jobId - } - }); - } + if (successParse.success) { + logger.debug("Scrape succeeded!", { jobId }); + return successParse.data; + } else if (processingParse.success) { + throw new StillProcessingError(jobId); + } else if (failedParse.success) { + logger.debug("Scrape job failed", { status, jobId }); + if ( + typeof status.error === "string" && + status.error.includes("Chrome error: ") + ) { + throw new SiteError(status.error.split("Chrome error: ")[1]); } else { - logger.debug("Check status returned response not matched by any schema", { status, jobId }); - throw new Error("Check status returned response not matched by any schema", { - cause: { - status, jobId - } - }); + throw new EngineError("Scrape job failed", { + cause: { + status, + jobId + } + }); } + } else { + logger.debug("Check status returned response not matched by any schema", { + status, + jobId + }); + throw new Error( + "Check status returned response not matched by any schema", + { + cause: { + status, + jobId + } + } + ); + } } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts index ed07be88..96d73390 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts @@ -4,30 +4,33 @@ import * as Sentry from "@sentry/node"; import { robustFetch } from "../../lib/fetch"; export async function fireEngineDelete(logger: Logger, jobId: string) { - const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; + const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; - await Sentry.startSpan({ - name: "fire-engine: Delete scrape", - attributes: { - jobId, - } - }, async span => { - await robustFetch( - { - url: `${fireEngineURL}/scrape/${jobId}`, - method: "DELETE", - headers: { - ...(Sentry.isInitialized() ? ({ - "sentry-trace": Sentry.spanToTraceHeader(span), - "baggage": Sentry.spanToBaggageHeader(span), - }) : {}), - }, - ignoreResponse: true, - ignoreFailure: true, - logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }), - } - ) - }); + await Sentry.startSpan( + { + name: "fire-engine: Delete scrape", + attributes: { + jobId + } + }, + async (span) => { + await robustFetch({ + url: `${fireEngineURL}/scrape/${jobId}`, + method: "DELETE", + headers: { + ...(Sentry.isInitialized() + ? { + "sentry-trace": Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span) + } + : {}) + }, + ignoreResponse: true, + ignoreFailure: true, + logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }) + }); + } + ); - // We do not care whether this fails or not. -} \ No newline at end of file + // We do not care whether this fails or not. +} diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index ae953c1b..851b8faf 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -1,8 +1,18 @@ import { Logger } from "winston"; import { Meta } from "../.."; -import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape"; +import { + fireEngineScrape, + FireEngineScrapeRequestChromeCDP, + FireEngineScrapeRequestCommon, + FireEngineScrapeRequestPlaywright, + FireEngineScrapeRequestTLSClient +} from "./scrape"; import { EngineScrapeResult } from ".."; -import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus"; +import { + fireEngineCheckStatus, + FireEngineCheckStatusSuccess, + StillProcessingError +} from "./checkStatus"; import { EngineError, SiteError, TimeoutError } from "../../error"; import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; @@ -13,203 +23,293 @@ export const defaultTimeout = 10000; // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the // `scrapeURLWithFireEngine*` functions. -async function performFireEngineScrape( - logger: Logger, - request: FireEngineScrapeRequestCommon & Engine, - timeout = defaultTimeout, +async function performFireEngineScrape< + Engine extends + | FireEngineScrapeRequestChromeCDP + | FireEngineScrapeRequestPlaywright + | FireEngineScrapeRequestTLSClient +>( + logger: Logger, + request: FireEngineScrapeRequestCommon & Engine, + timeout = defaultTimeout ): Promise { - const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request); + const scrape = await fireEngineScrape( + logger.child({ method: "fireEngineScrape" }), + request + ); - const startTime = Date.now(); - const errorLimit = 3; - let errors: any[] = []; - let status: FireEngineCheckStatusSuccess | undefined = undefined; + const startTime = Date.now(); + const errorLimit = 3; + let errors: any[] = []; + let status: FireEngineCheckStatusSuccess | undefined = undefined; - while (status === undefined) { - if (errors.length >= errorLimit) { - logger.error("Error limit hit.", { errors }); - throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } }); - } - - if (Date.now() - startTime > timeout) { - logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout }); - throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } }); - } - - try { - status = await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId) - } catch (error) { - if (error instanceof StillProcessingError) { - // nop - } else if (error instanceof EngineError || error instanceof SiteError) { - logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId }); - throw error; - } else { - Sentry.captureException(error); - errors.push(error); - logger.debug(`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId }); - } - } - - await new Promise((resolve) => setTimeout(resolve, 250)); + while (status === undefined) { + if (errors.length >= errorLimit) { + logger.error("Error limit hit.", { errors }); + throw new Error("Error limit hit. See e.cause.errors for errors.", { + cause: { errors } + }); } - return status; + if (Date.now() - startTime > timeout) { + logger.info( + "Fire-engine was unable to scrape the page before timing out.", + { errors, timeout } + ); + throw new TimeoutError( + "Fire-engine was unable to scrape the page before timing out", + { cause: { errors, timeout } } + ); + } + + try { + status = await fireEngineCheckStatus( + logger.child({ method: "fireEngineCheckStatus" }), + scrape.jobId + ); + } catch (error) { + if (error instanceof StillProcessingError) { + // nop + } else if (error instanceof EngineError || error instanceof SiteError) { + logger.debug("Fire-engine scrape job failed.", { + error, + jobId: scrape.jobId + }); + throw error; + } else { + Sentry.captureException(error); + errors.push(error); + logger.debug( + `An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, + { error, jobId: scrape.jobId } + ); + } + } + + await new Promise((resolve) => setTimeout(resolve, 250)); + } + + return status; } -export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise { - const actions: Action[] = [ - // Transform waitFor option into an action (unsupported by chrome-cdp) - ...(meta.options.waitFor !== 0 ? [{ +export async function scrapeURLWithFireEngineChromeCDP( + meta: Meta +): Promise { + const actions: Action[] = [ + // Transform waitFor option into an action (unsupported by chrome-cdp) + ...(meta.options.waitFor !== 0 + ? [ + { type: "wait" as const, - milliseconds: meta.options.waitFor, - }] : []), + milliseconds: meta.options.waitFor + } + ] + : []), - // Transform screenshot format into an action (unsupported by chrome-cdp) - ...(meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage") ? [{ + // Transform screenshot format into an action (unsupported by chrome-cdp) + ...(meta.options.formats.includes("screenshot") || + meta.options.formats.includes("screenshot@fullPage") + ? [ + { type: "screenshot" as const, - fullPage: meta.options.formats.includes("screenshot@fullPage"), - }] : []), + fullPage: meta.options.formats.includes("screenshot@fullPage") + } + ] + : []), - // Include specified actions - ...(meta.options.actions ?? []), - ]; + // Include specified actions + ...(meta.options.actions ?? []) + ]; - const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = { - url: meta.url, - engine: "chrome-cdp", - instantReturn: true, - skipTlsVerification: meta.options.skipTlsVerification, - headers: meta.options.headers, - ...(actions.length > 0 ? ({ - actions, - }) : {}), - priority: meta.internalOptions.priority, - geolocation: meta.options.geolocation, - mobile: meta.options.mobile, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic - disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, - // TODO: scrollXPaths - }; + const request: FireEngineScrapeRequestCommon & + FireEngineScrapeRequestChromeCDP = { + url: meta.url, + engine: "chrome-cdp", + instantReturn: true, + skipTlsVerification: meta.options.skipTlsVerification, + headers: meta.options.headers, + ...(actions.length > 0 + ? { + actions + } + : {}), + priority: meta.internalOptions.priority, + geolocation: meta.options.geolocation, + mobile: meta.options.mobile, + timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic + disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache + // TODO: scrollXPaths + }; - const totalWait = actions.reduce((a,x) => x.type === "wait" ? (x.milliseconds ?? 1000) + a : a, 0); + const totalWait = actions.reduce( + (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), + 0 + ); - let response = await performFireEngineScrape( - meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }), - request, - meta.options.timeout !== undefined - ? defaultTimeout + totalWait - : Infinity, // TODO: better timeout handling + let response = await performFireEngineScrape( + meta.logger.child({ + method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", + request + }), + request, + meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity // TODO: better timeout handling + ); + + specialtyScrapeCheck( + meta.logger.child({ + method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" + }), + response.responseHeaders + ); + + if ( + meta.options.formats.includes("screenshot") || + meta.options.formats.includes("screenshot@fullPage") + ) { + meta.logger.debug( + "Transforming screenshots from actions into screenshot field", + { screenshots: response.screenshots } ); + response.screenshot = (response.screenshots ?? [])[0]; + (response.screenshots ?? []).splice(0, 1); + meta.logger.debug("Screenshot transformation done", { + screenshots: response.screenshots, + screenshot: response.screenshot + }); + } - specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders); + if (!response.url) { + meta.logger.warn("Fire-engine did not return the response's URL", { + response, + sourceURL: meta.url + }); + } - if (meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage")) { - meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots }); - response.screenshot = (response.screenshots ?? [])[0]; - (response.screenshots ?? []).splice(0, 1); - meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot }); - } + return { + url: response.url ?? meta.url, - if (!response.url) { - meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url }); - } + html: response.content, + error: response.pageError, + statusCode: response.pageStatusCode, - return { - url: response.url ?? meta.url, - - html: response.content, - error: response.pageError, - statusCode: response.pageStatusCode, - - screenshot: response.screenshot, - ...(actions.length > 0 ? { - actions: { - screenshots: response.screenshots ?? [], - scrapes: response.actionContent ?? [], - } - } : {}), - }; + screenshot: response.screenshot, + ...(actions.length > 0 + ? { + actions: { + screenshots: response.screenshots ?? [], + scrapes: response.actionContent ?? [] + } + } + : {}) + }; } -export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise { - const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { - url: meta.url, - engine: "playwright", - instantReturn: true, +export async function scrapeURLWithFireEnginePlaywright( + meta: Meta +): Promise { + const request: FireEngineScrapeRequestCommon & + FireEngineScrapeRequestPlaywright = { + url: meta.url, + engine: "playwright", + instantReturn: true, - headers: meta.options.headers, - priority: meta.internalOptions.priority, - screenshot: meta.options.formats.includes("screenshot"), - fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), - wait: meta.options.waitFor, - geolocation: meta.options.geolocation, + headers: meta.options.headers, + priority: meta.internalOptions.priority, + screenshot: meta.options.formats.includes("screenshot"), + fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"), + wait: meta.options.waitFor, + geolocation: meta.options.geolocation, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic - }; + timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic + }; - let response = await performFireEngineScrape( - meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }), - request, - meta.options.timeout !== undefined - ? defaultTimeout + meta.options.waitFor - : Infinity, // TODO: better timeout handling - ); - - specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders); + let response = await performFireEngineScrape( + meta.logger.child({ + method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", + request + }), + request, + meta.options.timeout !== undefined + ? defaultTimeout + meta.options.waitFor + : Infinity // TODO: better timeout handling + ); - if (!response.url) { - meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url }); - } + specialtyScrapeCheck( + meta.logger.child({ + method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" + }), + response.responseHeaders + ); - return { - url: response.url ?? meta.url, + if (!response.url) { + meta.logger.warn("Fire-engine did not return the response's URL", { + response, + sourceURL: meta.url + }); + } - html: response.content, - error: response.pageError, - statusCode: response.pageStatusCode, + return { + url: response.url ?? meta.url, - ...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({ - screenshot: response.screenshots[0], - }) : {}), - }; + html: response.content, + error: response.pageError, + statusCode: response.pageStatusCode, + + ...(response.screenshots !== undefined && response.screenshots.length > 0 + ? { + screenshot: response.screenshots[0] + } + : {}) + }; } -export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise { - const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = { - url: meta.url, - engine: "tlsclient", - instantReturn: true, +export async function scrapeURLWithFireEngineTLSClient( + meta: Meta +): Promise { + const request: FireEngineScrapeRequestCommon & + FireEngineScrapeRequestTLSClient = { + url: meta.url, + engine: "tlsclient", + instantReturn: true, - headers: meta.options.headers, - priority: meta.internalOptions.priority, + headers: meta.options.headers, + priority: meta.internalOptions.priority, - atsv: meta.internalOptions.atsv, - geolocation: meta.options.geolocation, - disableJsDom: meta.internalOptions.v0DisableJsDom, + atsv: meta.internalOptions.atsv, + geolocation: meta.options.geolocation, + disableJsDom: meta.internalOptions.v0DisableJsDom, - timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic - }; + timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic + }; - let response = await performFireEngineScrape( - meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }), - request, - meta.options.timeout !== undefined - ? defaultTimeout - : Infinity, // TODO: better timeout handling - ); + let response = await performFireEngineScrape( + meta.logger.child({ + method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", + request + }), + request, + meta.options.timeout !== undefined ? defaultTimeout : Infinity // TODO: better timeout handling + ); - specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders); + specialtyScrapeCheck( + meta.logger.child({ + method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" + }), + response.responseHeaders + ); - if (!response.url) { - meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url }); - } + if (!response.url) { + meta.logger.warn("Fire-engine did not return the response's URL", { + response, + sourceURL: meta.url + }); + } - return { - url: response.url ?? meta.url, + return { + url: response.url ?? meta.url, - html: response.content, - error: response.pageError, - statusCode: response.pageStatusCode, - }; + html: response.content, + error: response.pageError, + statusCode: response.pageStatusCode + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index 6efb4348..ffca4b41 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -6,92 +6,100 @@ import { Action } from "../../../../lib/entities"; import { robustFetch } from "../../lib/fetch"; export type FireEngineScrapeRequestCommon = { - url: string; - - headers?: { [K: string]: string }; + url: string; - blockMedia?: boolean; // default: true - blockAds?: boolean; // default: true - // pageOptions?: any; // unused, .scrollXPaths is considered on FE side + headers?: { [K: string]: string }; - // useProxy?: boolean; // unused, default: true - // customProxy?: string; // unused + blockMedia?: boolean; // default: true + blockAds?: boolean; // default: true + // pageOptions?: any; // unused, .scrollXPaths is considered on FE side - // disableSmartWaitCache?: boolean; // unused, default: false - // skipDnsCheck?: boolean; // unused, default: false + // useProxy?: boolean; // unused, default: true + // customProxy?: string; // unused - priority?: number; // default: 1 - // team_id?: string; // unused - logRequest?: boolean; // default: true - instantReturn?: boolean; // default: false - geolocation?: { country?: string; languages?: string[]; }; + // disableSmartWaitCache?: boolean; // unused, default: false + // skipDnsCheck?: boolean; // unused, default: false - timeout?: number; -} + priority?: number; // default: 1 + // team_id?: string; // unused + logRequest?: boolean; // default: true + instantReturn?: boolean; // default: false + geolocation?: { country?: string; languages?: string[] }; + + timeout?: number; +}; export type FireEngineScrapeRequestChromeCDP = { - engine: "chrome-cdp"; - skipTlsVerification?: boolean; - actions?: Action[]; - blockMedia?: true; // cannot be false - mobile?: boolean; - disableSmartWaitCache?: boolean; + engine: "chrome-cdp"; + skipTlsVerification?: boolean; + actions?: Action[]; + blockMedia?: true; // cannot be false + mobile?: boolean; + disableSmartWaitCache?: boolean; }; export type FireEngineScrapeRequestPlaywright = { - engine: "playwright"; - blockAds?: boolean; // default: true + engine: "playwright"; + blockAds?: boolean; // default: true - // mutually exclusive, default: false - screenshot?: boolean; - fullPageScreenshot?: boolean; + // mutually exclusive, default: false + screenshot?: boolean; + fullPageScreenshot?: boolean; - wait?: number; // default: 0 + wait?: number; // default: 0 }; export type FireEngineScrapeRequestTLSClient = { - engine: "tlsclient"; - atsv?: boolean; // v0 only, default: false - disableJsDom?: boolean; // v0 only, default: false - // blockAds?: boolean; // default: true + engine: "tlsclient"; + atsv?: boolean; // v0 only, default: false + disableJsDom?: boolean; // v0 only, default: false + // blockAds?: boolean; // default: true }; const schema = z.object({ - jobId: z.string(), - processing: z.boolean(), + jobId: z.string(), + processing: z.boolean() }); -export async function fireEngineScrape ( - logger: Logger, - request: FireEngineScrapeRequestCommon & Engine, +export async function fireEngineScrape< + Engine extends + | FireEngineScrapeRequestChromeCDP + | FireEngineScrapeRequestPlaywright + | FireEngineScrapeRequestTLSClient +>( + logger: Logger, + request: FireEngineScrapeRequestCommon & Engine ): Promise> { - const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; + const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; - // TODO: retries + // TODO: retries - const scrapeRequest = await Sentry.startSpan({ - name: "fire-engine: Scrape", - attributes: { - url: request.url, + const scrapeRequest = await Sentry.startSpan( + { + name: "fire-engine: Scrape", + attributes: { + url: request.url + } + }, + async (span) => { + return await robustFetch({ + url: `${fireEngineURL}/scrape`, + method: "POST", + headers: { + ...(Sentry.isInitialized() + ? { + "sentry-trace": Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span) + } + : {}) }, - }, async span => { - return await robustFetch( - { - url: `${fireEngineURL}/scrape`, - method: "POST", - headers: { - ...(Sentry.isInitialized() ? ({ - "sentry-trace": Sentry.spanToTraceHeader(span), - "baggage": Sentry.spanToBaggageHeader(span), - }) : {}), - }, - body: request, - logger: logger.child({ method: "fireEngineScrape/robustFetch" }), - schema, - tryCount: 3, - } - ); - }); + body: request, + logger: logger.child({ method: "fireEngineScrape/robustFetch" }), + schema, + tryCount: 3 + }); + } + ); - return scrapeRequest; -} \ No newline at end of file + return scrapeRequest; +} diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 8a8f4476..1d9db249 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -1,316 +1,387 @@ import { ScrapeActionContent } from "../../../lib/entities"; import { Meta } from ".."; import { scrapeDOCX } from "./docx"; -import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine"; +import { + scrapeURLWithFireEngineChromeCDP, + scrapeURLWithFireEnginePlaywright, + scrapeURLWithFireEngineTLSClient +} from "./fire-engine"; import { scrapePDF } from "./pdf"; import { scrapeURLWithScrapingBee } from "./scrapingbee"; import { scrapeURLWithFetch } from "./fetch"; import { scrapeURLWithPlaywright } from "./playwright"; import { scrapeCache } from "./cache"; -export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache"; - -const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined; -const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined; -const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined; -const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined; +export type Engine = + | "fire-engine;chrome-cdp" + | "fire-engine;playwright" + | "fire-engine;tlsclient" + | "scrapingbee" + | "scrapingbeeLoad" + | "playwright" + | "fetch" + | "pdf" + | "docx" + | "cache"; +const useScrapingBee = + process.env.SCRAPING_BEE_API_KEY !== "" && + process.env.SCRAPING_BEE_API_KEY !== undefined; +const useFireEngine = + process.env.FIRE_ENGINE_BETA_URL !== "" && + process.env.FIRE_ENGINE_BETA_URL !== undefined; +const usePlaywright = + process.env.PLAYWRIGHT_MICROSERVICE_URL !== "" && + process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined; +const useCache = + process.env.CACHE_REDIS_URL !== "" && + process.env.CACHE_REDIS_URL !== undefined; export const engines: Engine[] = [ - // ...(useCache ? [ "cache" as const ] : []), - ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []), - ...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []), - ...(usePlaywright ? [ "playwright" as const ] : []), - "fetch", - "pdf", - "docx", + // ...(useCache ? [ "cache" as const ] : []), + ...(useFireEngine + ? [ + "fire-engine;chrome-cdp" as const, + "fire-engine;playwright" as const, + "fire-engine;tlsclient" as const + ] + : []), + ...(useScrapingBee + ? ["scrapingbee" as const, "scrapingbeeLoad" as const] + : []), + ...(usePlaywright ? ["playwright" as const] : []), + "fetch", + "pdf", + "docx" ]; export const featureFlags = [ - "actions", - "waitFor", - "screenshot", - "screenshot@fullScreen", - "pdf", - "docx", - "atsv", - "location", - "mobile", - "skipTlsVerification", - "useFastMode", + "actions", + "waitFor", + "screenshot", + "screenshot@fullScreen", + "pdf", + "docx", + "atsv", + "location", + "mobile", + "skipTlsVerification", + "useFastMode" ] as const; -export type FeatureFlag = typeof featureFlags[number]; +export type FeatureFlag = (typeof featureFlags)[number]; export const featureFlagOptions: { - [F in FeatureFlag]: { - priority: number; - } + [F in FeatureFlag]: { + priority: number; + }; } = { - "actions": { priority: 20 }, - "waitFor": { priority: 1 }, - "screenshot": { priority: 10 }, - "screenshot@fullScreen": { priority: 10 }, - "pdf": { priority: 100 }, - "docx": { priority: 100 }, - "atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not - "useFastMode": { priority: 90 }, - "location": { priority: 10 }, - "mobile": { priority: 10 }, - "skipTlsVerification": { priority: 10 }, + actions: { priority: 20 }, + waitFor: { priority: 1 }, + screenshot: { priority: 10 }, + "screenshot@fullScreen": { priority: 10 }, + pdf: { priority: 100 }, + docx: { priority: 100 }, + atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not + useFastMode: { priority: 90 }, + location: { priority: 10 }, + mobile: { priority: 10 }, + skipTlsVerification: { priority: 10 } } as const; export type EngineScrapeResult = { - url: string; + url: string; - html: string; - markdown?: string; - statusCode: number; - error?: string; + html: string; + markdown?: string; + statusCode: number; + error?: string; - screenshot?: string; - actions?: { - screenshots: string[]; - scrapes: ScrapeActionContent[]; - }; -} + screenshot?: string; + actions?: { + screenshots: string[]; + scrapes: ScrapeActionContent[]; + }; +}; const engineHandlers: { - [E in Engine]: (meta: Meta) => Promise + [E in Engine]: (meta: Meta) => Promise; } = { - "cache": scrapeCache, - "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, - "fire-engine;playwright": scrapeURLWithFireEnginePlaywright, - "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient, - "scrapingbee": scrapeURLWithScrapingBee("domcontentloaded"), - "scrapingbeeLoad": scrapeURLWithScrapingBee("networkidle2"), - "playwright": scrapeURLWithPlaywright, - "fetch": scrapeURLWithFetch, - "pdf": scrapePDF, - "docx": scrapeDOCX, + cache: scrapeCache, + "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP, + "fire-engine;playwright": scrapeURLWithFireEnginePlaywright, + "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient, + scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"), + scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"), + playwright: scrapeURLWithPlaywright, + fetch: scrapeURLWithFetch, + pdf: scrapePDF, + docx: scrapeDOCX }; export const engineOptions: { - [E in Engine]: { - // A list of feature flags the engine supports. - features: { [F in FeatureFlag]: boolean }, + [E in Engine]: { + // A list of feature flags the engine supports. + features: { [F in FeatureFlag]: boolean }; - // This defines the order of engines in general. The engine with the highest quality will be used the most. - // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX - quality: number, - } + // This defines the order of engines in general. The engine with the highest quality will be used the most. + // Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX + quality: number; + }; } = { - "cache": { - features: { - "actions": false, - "waitFor": true, - "screenshot": false, - "screenshot@fullScreen": false, - "pdf": false, // TODO: figure this out - "docx": false, // TODO: figure this out - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": false, - }, - quality: 1000, // cache should always be tried first + cache: { + features: { + actions: false, + waitFor: true, + screenshot: false, + "screenshot@fullScreen": false, + pdf: false, // TODO: figure this out + docx: false, // TODO: figure this out + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: false }, - "fire-engine;chrome-cdp": { - features: { - "actions": true, - "waitFor": true, // through actions transform - "screenshot": true, // through actions transform - "screenshot@fullScreen": true, // through actions transform - "pdf": false, - "docx": false, - "atsv": false, - "location": true, - "mobile": true, - "skipTlsVerification": true, - "useFastMode": false, - }, - quality: 50, + quality: 1000 // cache should always be tried first + }, + "fire-engine;chrome-cdp": { + features: { + actions: true, + waitFor: true, // through actions transform + screenshot: true, // through actions transform + "screenshot@fullScreen": true, // through actions transform + pdf: false, + docx: false, + atsv: false, + location: true, + mobile: true, + skipTlsVerification: true, + useFastMode: false }, - "fire-engine;playwright": { - features: { - "actions": false, - "waitFor": true, - "screenshot": true, - "screenshot@fullScreen": true, - "pdf": false, - "docx": false, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": false, - }, - quality: 40, + quality: 50 + }, + "fire-engine;playwright": { + features: { + actions: false, + waitFor: true, + screenshot: true, + "screenshot@fullScreen": true, + pdf: false, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: false }, - "scrapingbee": { - features: { - "actions": false, - "waitFor": true, - "screenshot": true, - "screenshot@fullScreen": true, - "pdf": false, - "docx": false, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": false, - }, - quality: 30, + quality: 40 + }, + scrapingbee: { + features: { + actions: false, + waitFor: true, + screenshot: true, + "screenshot@fullScreen": true, + pdf: false, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: false }, - "scrapingbeeLoad": { - features: { - "actions": false, - "waitFor": true, - "screenshot": true, - "screenshot@fullScreen": true, - "pdf": false, - "docx": false, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": false, - }, - quality: 29, + quality: 30 + }, + scrapingbeeLoad: { + features: { + actions: false, + waitFor: true, + screenshot: true, + "screenshot@fullScreen": true, + pdf: false, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: false }, - "playwright": { - features: { - "actions": false, - "waitFor": true, - "screenshot": false, - "screenshot@fullScreen": false, - "pdf": false, - "docx": false, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": false, - }, - quality: 20, + quality: 29 + }, + playwright: { + features: { + actions: false, + waitFor: true, + screenshot: false, + "screenshot@fullScreen": false, + pdf: false, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: false }, - "fire-engine;tlsclient": { - features: { - "actions": false, - "waitFor": false, - "screenshot": false, - "screenshot@fullScreen": false, - "pdf": false, - "docx": false, - "atsv": true, - "location": true, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": true, - }, - quality: 10, + quality: 20 + }, + "fire-engine;tlsclient": { + features: { + actions: false, + waitFor: false, + screenshot: false, + "screenshot@fullScreen": false, + pdf: false, + docx: false, + atsv: true, + location: true, + mobile: false, + skipTlsVerification: false, + useFastMode: true }, - "fetch": { - features: { - "actions": false, - "waitFor": false, - "screenshot": false, - "screenshot@fullScreen": false, - "pdf": false, - "docx": false, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": true, - }, - quality: 5, + quality: 10 + }, + fetch: { + features: { + actions: false, + waitFor: false, + screenshot: false, + "screenshot@fullScreen": false, + pdf: false, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: true }, - "pdf": { - features: { - "actions": false, - "waitFor": false, - "screenshot": false, - "screenshot@fullScreen": false, - "pdf": true, - "docx": false, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": true, - }, - quality: -10, + quality: 5 + }, + pdf: { + features: { + actions: false, + waitFor: false, + screenshot: false, + "screenshot@fullScreen": false, + pdf: true, + docx: false, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: true }, - "docx": { - features: { - "actions": false, - "waitFor": false, - "screenshot": false, - "screenshot@fullScreen": false, - "pdf": false, - "docx": true, - "atsv": false, - "location": false, - "mobile": false, - "skipTlsVerification": false, - "useFastMode": true, - }, - quality: -10, + quality: -10 + }, + docx: { + features: { + actions: false, + waitFor: false, + screenshot: false, + "screenshot@fullScreen": false, + pdf: false, + docx: true, + atsv: false, + location: false, + mobile: false, + skipTlsVerification: false, + useFastMode: true }, + quality: -10 + } }; export function buildFallbackList(meta: Meta): { - engine: Engine, - unsupportedFeatures: Set, + engine: Engine; + unsupportedFeatures: Set; }[] { - const prioritySum = [...meta.featureFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0); - const priorityThreshold = Math.floor(prioritySum / 2); - let selectedEngines: { - engine: Engine, - supportScore: number, - unsupportedFeatures: Set, - }[] = []; + const prioritySum = [...meta.featureFlags].reduce( + (a, x) => a + featureFlagOptions[x].priority, + 0 + ); + const priorityThreshold = Math.floor(prioritySum / 2); + let selectedEngines: { + engine: Engine; + supportScore: number; + unsupportedFeatures: Set; + }[] = []; - const currentEngines = meta.internalOptions.forceEngine !== undefined ? [meta.internalOptions.forceEngine] : engines; + const currentEngines = + meta.internalOptions.forceEngine !== undefined + ? [meta.internalOptions.forceEngine] + : engines; - for (const engine of currentEngines) { - const supportedFlags = new Set([...Object.entries(engineOptions[engine].features).filter(([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true).map(([k, _]) => k)]); - const supportScore = [...supportedFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0); + for (const engine of currentEngines) { + const supportedFlags = new Set([ + ...Object.entries(engineOptions[engine].features) + .filter( + ([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true + ) + .map(([k, _]) => k) + ]); + const supportScore = [...supportedFlags].reduce( + (a, x) => a + featureFlagOptions[x].priority, + 0 + ); - const unsupportedFeatures = new Set([...meta.featureFlags]); - for (const flag of meta.featureFlags) { - if (supportedFlags.has(flag)) { - unsupportedFeatures.delete(flag); - } - } + const unsupportedFeatures = new Set([...meta.featureFlags]); + for (const flag of meta.featureFlags) { + if (supportedFlags.has(flag)) { + unsupportedFeatures.delete(flag); + } + } - if (supportScore >= priorityThreshold) { - selectedEngines.push({ engine, supportScore, unsupportedFeatures }); - meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures }); - } else { - meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures}); + if (supportScore >= priorityThreshold) { + selectedEngines.push({ engine, supportScore, unsupportedFeatures }); + meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { + supportScore, + prioritySum, + priorityThreshold, + featureFlags: [...meta.featureFlags], + unsupportedFeatures + }); + } else { + meta.logger.debug( + `Engine ${engine} does not meet feature priority threshold`, + { + supportScore, + prioritySum, + priorityThreshold, + featureFlags: [...meta.featureFlags], + unsupportedFeatures } + ); } + } - if (selectedEngines.some(x => engineOptions[x.engine].quality > 0)) { - selectedEngines = selectedEngines.filter(x => engineOptions[x.engine].quality > 0); - } + if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) { + selectedEngines = selectedEngines.filter( + (x) => engineOptions[x.engine].quality > 0 + ); + } - selectedEngines.sort((a,b) => b.supportScore - a.supportScore || engineOptions[b.engine].quality - engineOptions[a.engine].quality); + selectedEngines.sort( + (a, b) => + b.supportScore - a.supportScore || + engineOptions[b.engine].quality - engineOptions[a.engine].quality + ); - return selectedEngines; + return selectedEngines; } -export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise { - const fn = engineHandlers[engine]; - const logger = meta.logger.child({ method: fn.name ?? "scrapeURLWithEngine", engine }); - const _meta = { - ...meta, - logger, - }; +export async function scrapeURLWithEngine( + meta: Meta, + engine: Engine +): Promise { + const fn = engineHandlers[engine]; + const logger = meta.logger.child({ + method: fn.name ?? "scrapeURLWithEngine", + engine + }); + const _meta = { + ...meta, + logger + }; - return await fn(_meta); + return await fn(_meta); } diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index b441943c..62313a71 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -10,152 +10,179 @@ import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; -type PDFProcessorResult = {html: string, markdown?: string}; +type PDFProcessorResult = { html: string; markdown?: string }; -async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise { - meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath }); +async function scrapePDFWithLlamaParse( + meta: Meta, + tempFilePath: string +): Promise { + meta.logger.debug("Processing PDF document with LlamaIndex", { + tempFilePath + }); - const uploadForm = new FormData(); + const uploadForm = new FormData(); - // This is utterly stupid but it works! - mogery - uploadForm.append("file", { - [Symbol.toStringTag]: "Blob", - name: tempFilePath, - stream() { - return createReadStream(tempFilePath) as unknown as ReadableStream - }, - arrayBuffer() { - throw Error("Unimplemented in mock Blob: arrayBuffer") - }, - size: (await fs.stat(tempFilePath)).size, - text() { - throw Error("Unimplemented in mock Blob: text") - }, - slice(start, end, contentType) { - throw Error("Unimplemented in mock Blob: slice") - }, - type: "application/pdf", - } as Blob); + // This is utterly stupid but it works! - mogery + uploadForm.append("file", { + [Symbol.toStringTag]: "Blob", + name: tempFilePath, + stream() { + return createReadStream( + tempFilePath + ) as unknown as ReadableStream; + }, + arrayBuffer() { + throw Error("Unimplemented in mock Blob: arrayBuffer"); + }, + size: (await fs.stat(tempFilePath)).size, + text() { + throw Error("Unimplemented in mock Blob: text"); + }, + slice(start, end, contentType) { + throw Error("Unimplemented in mock Blob: slice"); + }, + type: "application/pdf" + } as Blob); - const upload = await robustFetch({ - url: "https://api.cloud.llamaindex.ai/api/parsing/upload", - method: "POST", + const upload = await robustFetch({ + url: "https://api.cloud.llamaindex.ai/api/parsing/upload", + method: "POST", + headers: { + Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}` + }, + body: uploadForm, + logger: meta.logger.child({ + method: "scrapePDFWithLlamaParse/upload/robustFetch" + }), + schema: z.object({ + id: z.string() + }) + }); + + const jobId = upload.id; + + // TODO: timeout, retries + const startedAt = Date.now(); + + while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) { + try { + const result = await robustFetch({ + url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, + method: "GET", headers: { - "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`, + Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}` }, - body: uploadForm, - logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }), - schema: z.object({ - id: z.string(), + logger: meta.logger.child({ + method: "scrapePDFWithLlamaParse/result/robustFetch" }), - }); - - const jobId = upload.id; - - // TODO: timeout, retries - const startedAt = Date.now(); - - while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) { - try { - const result = await robustFetch({ - url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, - method: "GET", - headers: { - "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`, - }, - logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }), - schema: z.object({ - markdown: z.string(), - }), - }); - return { - markdown: result.markdown, - html: await marked.parse(result.markdown, { async: true }), - }; - } catch (e) { - if (e instanceof Error && e.message === "Request sent failure status") { - if ((e.cause as any).response.status === 404) { - // no-op, result not up yet - } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) { - // URL is not a PDF, actually! - meta.logger.debug("URL is not actually a PDF, signalling..."); - throw new RemoveFeatureError(["pdf"]); - } else { - throw new Error("LlamaParse threw an error", { - cause: e.cause, - }); - } - } else { - throw e; - } + schema: z.object({ + markdown: z.string() + }) + }); + return { + markdown: result.markdown, + html: await marked.parse(result.markdown, { async: true }) + }; + } catch (e) { + if (e instanceof Error && e.message === "Request sent failure status") { + if ((e.cause as any).response.status === 404) { + // no-op, result not up yet + } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) { + // URL is not a PDF, actually! + meta.logger.debug("URL is not actually a PDF, signalling..."); + throw new RemoveFeatureError(["pdf"]); + } else { + throw new Error("LlamaParse threw an error", { + cause: e.cause + }); } - - await new Promise((resolve) => setTimeout(() => resolve(), 250)); + } else { + throw e; + } } - throw new Error("LlamaParse timed out"); + await new Promise((resolve) => setTimeout(() => resolve(), 250)); + } + + throw new Error("LlamaParse timed out"); } -async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise { - meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); +async function scrapePDFWithParsePDF( + meta: Meta, + tempFilePath: string +): Promise { + meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); - const result = await PdfParse(await fs.readFile(tempFilePath)); - const escaped = escapeHtml(result.text); + const result = await PdfParse(await fs.readFile(tempFilePath)); + const escaped = escapeHtml(result.text); - return { - markdown: escaped, - html: escaped, - }; + return { + markdown: escaped, + html: escaped + }; } export async function scrapePDF(meta: Meta): Promise { - if (!meta.options.parsePDF) { - const file = await fetchFileToBuffer(meta.url); - const content = file.buffer.toString("base64"); - return { - url: file.response.url, - statusCode: file.response.status, - - html: content, - markdown: content, - }; - } - - const { response, tempFilePath } = await downloadFile(meta.id, meta.url); - - let result: PDFProcessorResult | null = null; - if (process.env.LLAMAPARSE_API_KEY) { - try { - result = await scrapePDFWithLlamaParse({ - ...meta, - logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }), - }, tempFilePath); - } catch (error) { - if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { error }); - } else if (error instanceof RemoveFeatureError) { - throw error; - } else { - meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error }); - Sentry.captureException(error); - } - } - } - - if (result === null) { - result = await scrapePDFWithParsePDF({ - ...meta, - logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }), - }, tempFilePath); - } - - await fs.unlink(tempFilePath); - + if (!meta.options.parsePDF) { + const file = await fetchFileToBuffer(meta.url); + const content = file.buffer.toString("base64"); return { - url: response.url, - statusCode: response.status, + url: file.response.url, + statusCode: file.response.status, - html: result.html, - markdown: result.markdown, + html: content, + markdown: content + }; + } + + const { response, tempFilePath } = await downloadFile(meta.id, meta.url); + + let result: PDFProcessorResult | null = null; + if (process.env.LLAMAPARSE_API_KEY) { + try { + result = await scrapePDFWithLlamaParse( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithLlamaParse" + }) + }, + tempFilePath + ); + } catch (error) { + if (error instanceof Error && error.message === "LlamaParse timed out") { + meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { + error + }); + } else if (error instanceof RemoveFeatureError) { + throw error; + } else { + meta.logger.warn( + "LlamaParse failed to parse PDF -- falling back to parse-pdf", + { error } + ); + Sentry.captureException(error); + } } + } + + if (result === null) { + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }) + }, + tempFilePath + ); + } + + await fs.unlink(tempFilePath); + + return { + url: response.url, + statusCode: response.status, + + html: result.html, + markdown: result.markdown + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index 887b8b64..a8c16045 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -4,39 +4,44 @@ import { Meta } from "../.."; import { TimeoutError } from "../../error"; import { robustFetch } from "../../lib/fetch"; -export async function scrapeURLWithPlaywright(meta: Meta): Promise { - const timeout = 20000 + meta.options.waitFor; +export async function scrapeURLWithPlaywright( + meta: Meta +): Promise { + const timeout = 20000 + meta.options.waitFor; - const response = await Promise.race([ - await robustFetch({ - url: process.env.PLAYWRIGHT_MICROSERVICE_URL!, - headers: { - "Content-Type": "application/json", - }, - body: { - url: meta.url, - wait_after_load: meta.options.waitFor, - timeout, - headers: meta.options.headers, - }, - method: "POST", - logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"), - schema: z.object({ - content: z.string(), - pageStatusCode: z.number(), - pageError: z.string().optional(), - }), - }), - (async () => { - await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); - throw new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } }); - })(), - ]); + const response = await Promise.race([ + await robustFetch({ + url: process.env.PLAYWRIGHT_MICROSERVICE_URL!, + headers: { + "Content-Type": "application/json" + }, + body: { + url: meta.url, + wait_after_load: meta.options.waitFor, + timeout, + headers: meta.options.headers + }, + method: "POST", + logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"), + schema: z.object({ + content: z.string(), + pageStatusCode: z.number(), + pageError: z.string().optional() + }) + }), + (async () => { + await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); + throw new TimeoutError( + "Playwright was unable to scrape the page before timing out", + { cause: { timeout } } + ); + })() + ]); - return { - url: meta.url, // TODO: impove redirect following - html: response.content, - statusCode: response.pageStatusCode, - error: response.pageError, - } + return { + url: meta.url, // TODO: impove redirect following + html: response.content, + statusCode: response.pageStatusCode, + error: response.pageError + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts index 9b946bf0..8388016a 100644 --- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts @@ -7,60 +7,82 @@ import { EngineError } from "../../error"; const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); -export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "networkidle2"): ((meta: Meta) => Promise) { - return async (meta: Meta): Promise => { - let response: AxiosResponse; - try { - response = await client.get({ - url: meta.url, - params: { - timeout: 15000, // TODO: dynamic timeout based on request timeout - wait_browser: wait_browser, - wait: Math.min(meta.options.waitFor, 35000), - transparent_status_code: true, - json_response: true, - screenshot: meta.options.formats.includes("screenshot"), - screenshot_full_page: meta.options.formats.includes("screenshot@fullPage"), - }, - headers: { - "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery - }, - }); - } catch (error) { - if (error instanceof AxiosError && error.response !== undefined) { - response = error.response; - } else { - throw error; - } +export function scrapeURLWithScrapingBee( + wait_browser: "domcontentloaded" | "networkidle2" +): (meta: Meta) => Promise { + return async (meta: Meta): Promise => { + let response: AxiosResponse; + try { + response = await client.get({ + url: meta.url, + params: { + timeout: 15000, // TODO: dynamic timeout based on request timeout + wait_browser: wait_browser, + wait: Math.min(meta.options.waitFor, 35000), + transparent_status_code: true, + json_response: true, + screenshot: meta.options.formats.includes("screenshot"), + screenshot_full_page: meta.options.formats.includes( + "screenshot@fullPage" + ) + }, + headers: { + "ScrapingService-Request": "TRUE" // this is sent to the page, not to ScrapingBee - mogery } + }); + } catch (error) { + if (error instanceof AxiosError && error.response !== undefined) { + response = error.response; + } else { + throw error; + } + } - const data: Buffer = response.data; - const body = JSON.parse(new TextDecoder().decode(data)); + const data: Buffer = response.data; + const body = JSON.parse(new TextDecoder().decode(data)); - const headers = body.headers ?? {}; - const isHiddenEngineError = !(headers["Date"] ?? headers["date"] ?? headers["Content-Type"] ?? headers["content-type"]); + const headers = body.headers ?? {}; + const isHiddenEngineError = !( + headers["Date"] ?? + headers["date"] ?? + headers["Content-Type"] ?? + headers["content-type"] + ); - if (body.errors || body.body?.error || isHiddenEngineError) { - meta.logger.error("ScrapingBee threw an error", { body: body.body?.error ?? body.errors ?? body.body ?? body }); - throw new EngineError("Engine error #34", { cause: { body, statusCode: response.status } }); - } + if (body.errors || body.body?.error || isHiddenEngineError) { + meta.logger.error("ScrapingBee threw an error", { + body: body.body?.error ?? body.errors ?? body.body ?? body + }); + throw new EngineError("Engine error #34", { + cause: { body, statusCode: response.status } + }); + } - if (typeof body.body !== "string") { - meta.logger.error("ScrapingBee: Body is not string??", { body }); - throw new EngineError("Engine error #35", { cause: { body, statusCode: response.status } }); - } + if (typeof body.body !== "string") { + meta.logger.error("ScrapingBee: Body is not string??", { body }); + throw new EngineError("Engine error #35", { + cause: { body, statusCode: response.status } + }); + } - specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" }), body.headers); + specialtyScrapeCheck( + meta.logger.child({ + method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" + }), + body.headers + ); - return { - url: body["resolved-url"] ?? meta.url, + return { + url: body["resolved-url"] ?? meta.url, - html: body.body, - error: response.status >= 300 ? response.statusText : undefined, - statusCode: response.status, - ...(body.screenshot ? ({ - screenshot: `data:image/png;base64,${body.screenshot}`, - }) : {}), - }; + html: body.body, + error: response.status >= 300 ? response.statusText : undefined, + statusCode: response.status, + ...(body.screenshot + ? { + screenshot: `data:image/png;base64,${body.screenshot}` + } + : {}) }; + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index 736faba7..84a52425 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -7,48 +7,53 @@ import { v4 as uuid } from "uuid"; import * as undici from "undici"; export async function fetchFileToBuffer(url: string): Promise<{ - response: Response, - buffer: Buffer + response: Response; + buffer: Buffer; }> { - const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying - return { - response, - buffer: Buffer.from(await response.arrayBuffer()), - }; + const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying + return { + response, + buffer: Buffer.from(await response.arrayBuffer()) + }; } -export async function downloadFile(id: string, url: string): Promise<{ - response: undici.Response - tempFilePath: string +export async function downloadFile( + id: string, + url: string +): Promise<{ + response: undici.Response; + tempFilePath: string; }> { - const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`); - const tempFileWrite = createWriteStream(tempFilePath); + const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`); + const tempFileWrite = createWriteStream(tempFilePath); - // TODO: maybe we could use tlsclient for this? for proxying - // use undici to ignore SSL for now - const response = await undici.fetch(url, { - dispatcher: new undici.Agent({ - connect: { - rejectUnauthorized: false, - }, - }) - }); - - // This should never happen in the current state of JS (2024), but let's check anyways. - if (response.body === null) { - throw new EngineError("Response body was null", { cause: { response } }); - } - - response.body.pipeTo(Writable.toWeb(tempFileWrite)); - await new Promise((resolve, reject) => { - tempFileWrite.on("finish", () => resolve(null)); - tempFileWrite.on("error", (error) => { - reject(new EngineError("Failed to write to temp file", { cause: { error } })); - }); + // TODO: maybe we could use tlsclient for this? for proxying + // use undici to ignore SSL for now + const response = await undici.fetch(url, { + dispatcher: new undici.Agent({ + connect: { + rejectUnauthorized: false + } }) + }); - return { - response, - tempFilePath, - }; + // This should never happen in the current state of JS (2024), but let's check anyways. + if (response.body === null) { + throw new EngineError("Response body was null", { cause: { response } }); + } + + response.body.pipeTo(Writable.toWeb(tempFileWrite)); + await new Promise((resolve, reject) => { + tempFileWrite.on("finish", () => resolve(null)); + tempFileWrite.on("error", (error) => { + reject( + new EngineError("Failed to write to temp file", { cause: { error } }) + ); + }); + }); + + return { + response, + tempFilePath + }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts index b10657a4..4f497e52 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts @@ -1,14 +1,32 @@ import { Logger } from "winston"; import { AddFeatureError } from "../../error"; -export function specialtyScrapeCheck(logger: Logger, headers: Record | undefined) { - const contentType = (Object.entries(headers ?? {}).find(x => x[0].toLowerCase() === "content-type") ?? [])[1]; +export function specialtyScrapeCheck( + logger: Logger, + headers: Record | undefined +) { + const contentType = (Object.entries(headers ?? {}).find( + (x) => x[0].toLowerCase() === "content-type" + ) ?? [])[1]; - if (contentType === undefined) { - logger.warn("Failed to check contentType -- was not present in headers", { headers }); - } else if (contentType === "application/pdf" || contentType.startsWith("application/pdf;")) { // .pdf - throw new AddFeatureError(["pdf"]); - } else if (contentType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || contentType.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document;")) { // .docx - throw new AddFeatureError(["docx"]); - } + if (contentType === undefined) { + logger.warn("Failed to check contentType -- was not present in headers", { + headers + }); + } else if ( + contentType === "application/pdf" || + contentType.startsWith("application/pdf;") + ) { + // .pdf + throw new AddFeatureError(["pdf"]); + } else if ( + contentType === + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || + contentType.startsWith( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document;" + ) + ) { + // .docx + throw new AddFeatureError(["docx"]); + } } diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index ccd7a359..c6eb45e3 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -1,51 +1,57 @@ -import { EngineResultsTracker } from "." -import { Engine, FeatureFlag } from "./engines" +import { EngineResultsTracker } from "."; +import { Engine, FeatureFlag } from "./engines"; export class EngineError extends Error { - constructor(message?: string, options?: ErrorOptions) { - super(message, options) - } + constructor(message?: string, options?: ErrorOptions) { + super(message, options); + } } export class TimeoutError extends Error { - constructor(message?: string, options?: ErrorOptions) { - super(message, options) - } + constructor(message?: string, options?: ErrorOptions) { + super(message, options); + } } export class NoEnginesLeftError extends Error { - public fallbackList: Engine[]; - public results: EngineResultsTracker; + public fallbackList: Engine[]; + public results: EngineResultsTracker; - constructor(fallbackList: Engine[], results: EngineResultsTracker) { - super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com."); - this.fallbackList = fallbackList; - this.results = results; - } + constructor(fallbackList: Engine[], results: EngineResultsTracker) { + super( + "All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com." + ); + this.fallbackList = fallbackList; + this.results = results; + } } export class AddFeatureError extends Error { - public featureFlags: FeatureFlag[]; + public featureFlags: FeatureFlag[]; - constructor(featureFlags: FeatureFlag[]) { - super("New feature flags have been discovered: " + featureFlags.join(", ")); - this.featureFlags = featureFlags; - } + constructor(featureFlags: FeatureFlag[]) { + super("New feature flags have been discovered: " + featureFlags.join(", ")); + this.featureFlags = featureFlags; + } } export class RemoveFeatureError extends Error { - public featureFlags: FeatureFlag[]; + public featureFlags: FeatureFlag[]; - constructor(featureFlags: FeatureFlag[]) { - super("Incorrect feature flags have been discovered: " + featureFlags.join(", ")); - this.featureFlags = featureFlags; - } + constructor(featureFlags: FeatureFlag[]) { + super( + "Incorrect feature flags have been discovered: " + featureFlags.join(", ") + ); + this.featureFlags = featureFlags; + } } export class SiteError extends Error { - public code: string; - constructor(code: string) { - super("Specified URL is failing to load in the browser. Error code: " + code) - this.code = code; - } + public code: string; + constructor(code: string) { + super( + "Specified URL is failing to load in the browser. Error code: " + code + ); + this.code = code; + } } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index f394ca2b..0a0b6c92 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -3,84 +3,104 @@ import * as Sentry from "@sentry/node"; import { Document, ScrapeOptions } from "../../controllers/v1/types"; import { logger } from "../../lib/logger"; -import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines"; +import { + buildFallbackList, + Engine, + EngineScrapeResult, + FeatureFlag, + scrapeURLWithEngine +} from "./engines"; import { parseMarkdown } from "../../lib/html-to-markdown"; -import { AddFeatureError, EngineError, NoEnginesLeftError, RemoveFeatureError, SiteError, TimeoutError } from "./error"; +import { + AddFeatureError, + EngineError, + NoEnginesLeftError, + RemoveFeatureError, + SiteError, + TimeoutError +} from "./error"; import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; import { urlSpecificParams } from "./lib/urlSpecificParams"; -export type ScrapeUrlResponse = ({ - success: true, - document: Document, -} | { - success: false, - error: any, -}) & { - logs: any[], - engines: EngineResultsTracker, -} +export type ScrapeUrlResponse = ( + | { + success: true; + document: Document; + } + | { + success: false; + error: any; + } +) & { + logs: any[]; + engines: EngineResultsTracker; +}; export type Meta = { - id: string; - url: string; - options: ScrapeOptions; - internalOptions: InternalOptions; - logger: Logger; - logs: any[]; - featureFlags: Set; -} + id: string; + url: string; + options: ScrapeOptions; + internalOptions: InternalOptions; + logger: Logger; + logs: any[]; + featureFlags: Set; +}; -function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions: InternalOptions): Set { - const flags: Set = new Set(); +function buildFeatureFlags( + url: string, + options: ScrapeOptions, + internalOptions: InternalOptions +): Set { + const flags: Set = new Set(); - if (options.actions !== undefined) { - flags.add("actions"); - } + if (options.actions !== undefined) { + flags.add("actions"); + } - if (options.formats.includes("screenshot")) { - flags.add("screenshot"); - } + if (options.formats.includes("screenshot")) { + flags.add("screenshot"); + } - if (options.formats.includes("screenshot@fullPage")) { - flags.add("screenshot@fullScreen"); - } + if (options.formats.includes("screenshot@fullPage")) { + flags.add("screenshot@fullScreen"); + } - if (options.waitFor !== 0) { - flags.add("waitFor"); - } + if (options.waitFor !== 0) { + flags.add("waitFor"); + } - if (internalOptions.atsv) { - flags.add("atsv"); - } + if (internalOptions.atsv) { + flags.add("atsv"); + } - if (options.location || options.geolocation) { - flags.add("location"); - } + if (options.location || options.geolocation) { + flags.add("location"); + } - if (options.mobile) { - flags.add("mobile"); - } - - if (options.skipTlsVerification) { - flags.add("skipTlsVerification"); - } + if (options.mobile) { + flags.add("mobile"); + } - if (internalOptions.v0UseFastMode) { - flags.add("useFastMode"); - } + if (options.skipTlsVerification) { + flags.add("skipTlsVerification"); + } - const urlO = new URL(url); + if (internalOptions.v0UseFastMode) { + flags.add("useFastMode"); + } - if (urlO.pathname.endsWith(".pdf")) { - flags.add("pdf"); - } + const urlO = new URL(url); - if (urlO.pathname.endsWith(".docx")) { - flags.add("docx"); - } + if (urlO.pathname.endsWith(".pdf")) { + flags.add("pdf"); + } - return flags; + if (urlO.pathname.endsWith(".docx")) { + flags.add("docx"); + } + + return flags; } // The meta object contains all required information to perform a scrape. @@ -88,244 +108,314 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions: // The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required) // Having a meta object that is treated as immutable helps the code stay clean and easily tracable, // while also retaining the benefits that WebScraper had from its OOP design. -function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta { - const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; - if (specParams !== undefined) { - options = Object.assign(options, specParams.scrapeOptions); - internalOptions = Object.assign(internalOptions, specParams.internalOptions); - } +function buildMetaObject( + id: string, + url: string, + options: ScrapeOptions, + internalOptions: InternalOptions +): Meta { + const specParams = + urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; + if (specParams !== undefined) { + options = Object.assign(options, specParams.scrapeOptions); + internalOptions = Object.assign( + internalOptions, + specParams.internalOptions + ); + } - const _logger = logger.child({ module: "ScrapeURL", scrapeId: id, scrapeURL: url }); - const logs: any[] = []; + const _logger = logger.child({ + module: "ScrapeURL", + scrapeId: id, + scrapeURL: url + }); + const logs: any[] = []; - return { - id, url, options, internalOptions, - logger: _logger, - logs, - featureFlags: buildFeatureFlags(url, options, internalOptions), - }; + return { + id, + url, + options, + internalOptions, + logger: _logger, + logs, + featureFlags: buildFeatureFlags(url, options, internalOptions) + }; } export type InternalOptions = { - priority?: number; // Passed along to fire-engine - forceEngine?: Engine; - atsv?: boolean; // anti-bot solver, beta + priority?: number; // Passed along to fire-engine + forceEngine?: Engine; + atsv?: boolean; // anti-bot solver, beta - v0CrawlOnlyUrls?: boolean; - v0UseFastMode?: boolean; - v0DisableJsDom?: boolean; + v0CrawlOnlyUrls?: boolean; + v0UseFastMode?: boolean; + v0DisableJsDom?: boolean; - disableSmartWaitCache?: boolean; // Passed along to fire-engine + disableSmartWaitCache?: boolean; // Passed along to fire-engine }; -export type EngineResultsTracker = { [E in Engine]?: ({ - state: "error", - error: any, - unexpected: boolean, -} | { - state: "success", - result: EngineScrapeResult & { markdown: string }, - factors: Record, - unsupportedFeatures: Set, -} | { - state: "timeout", -}) & { - startedAt: number, - finishedAt: number, -} }; +export type EngineResultsTracker = { + [E in Engine]?: ( + | { + state: "error"; + error: any; + unexpected: boolean; + } + | { + state: "success"; + result: EngineScrapeResult & { markdown: string }; + factors: Record; + unsupportedFeatures: Set; + } + | { + state: "timeout"; + } + ) & { + startedAt: number; + finishedAt: number; + }; +}; export type EngineScrapeResultWithContext = { - engine: Engine, - unsupportedFeatures: Set, - result: (EngineScrapeResult & { markdown: string }), + engine: Engine; + unsupportedFeatures: Set; + result: EngineScrapeResult & { markdown: string }; }; function safeguardCircularError(error: T): T { - if (typeof error === "object" && error !== null && (error as any).results) { - const newError = structuredClone(error); - delete (newError as any).results; - return newError; - } else { - return error; - } + if (typeof error === "object" && error !== null && (error as any).results) { + const newError = structuredClone(error); + delete (newError as any).results; + return newError; + } else { + return error; + } } -async function scrapeURLLoop( - meta: Meta -): Promise { - meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`,); +async function scrapeURLLoop(meta: Meta): Promise { + meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`); - // TODO: handle sitemap data, see WebScraper/index.ts:280 - // TODO: ScrapeEvents + // TODO: handle sitemap data, see WebScraper/index.ts:280 + // TODO: ScrapeEvents - const fallbackList = buildFallbackList(meta); + const fallbackList = buildFallbackList(meta); - const results: EngineResultsTracker = {}; - let result: EngineScrapeResultWithContext | null = null; + const results: EngineResultsTracker = {}; + let result: EngineScrapeResultWithContext | null = null; - for (const { engine, unsupportedFeatures } of fallbackList) { - const startedAt = Date.now(); - try { - meta.logger.info("Scraping via " + engine + "..."); - const _engineResult = await scrapeURLWithEngine(meta, engine); - if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly. - _engineResult.markdown = await parseMarkdown(_engineResult.html); - } - const engineResult = _engineResult as EngineScrapeResult & { markdown: string }; + for (const { engine, unsupportedFeatures } of fallbackList) { + const startedAt = Date.now(); + try { + meta.logger.info("Scraping via " + engine + "..."); + const _engineResult = await scrapeURLWithEngine(meta, engine); + if (_engineResult.markdown === undefined) { + // Some engines emit Markdown directly. + _engineResult.markdown = await parseMarkdown(_engineResult.html); + } + const engineResult = _engineResult as EngineScrapeResult & { + markdown: string; + }; - // Success factors - const isLongEnough = engineResult.markdown.length >= 20; - const isGoodStatusCode = (engineResult.statusCode >= 200 && engineResult.statusCode < 300) || engineResult.statusCode === 304; - const hasNoPageError = engineResult.error === undefined; + // Success factors + const isLongEnough = engineResult.markdown.length >= 20; + const isGoodStatusCode = + (engineResult.statusCode >= 200 && engineResult.statusCode < 300) || + engineResult.statusCode === 304; + const hasNoPageError = engineResult.error === undefined; - results[engine] = { - state: "success", - result: engineResult, - factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, - unsupportedFeatures, - startedAt, - finishedAt: Date.now(), - }; + results[engine] = { + state: "success", + result: engineResult, + factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, + unsupportedFeatures, + startedAt, + finishedAt: Date.now() + }; - // NOTE: TODO: what to do when status code is bad is tough... - // we cannot just rely on text because error messages can be brief and not hit the limit - // should we just use all the fallbacks and pick the one with the longest text? - mogery - if (isLongEnough || !isGoodStatusCode) { - meta.logger.info("Scrape via " + engine + " deemed successful.", { factors: { isLongEnough, isGoodStatusCode, hasNoPageError } }); - result = { - engine, - unsupportedFeatures, - result: engineResult as EngineScrapeResult & { markdown: string } - }; - break; - } - } catch (error) { - if (error instanceof EngineError) { - meta.logger.info("Engine " + engine + " could not scrape the page.", { error }); - results[engine] = { - state: "error", - error: safeguardCircularError(error), - unexpected: false, - startedAt, - finishedAt: Date.now(), - }; - } else if (error instanceof TimeoutError) { - meta.logger.info("Engine " + engine + " timed out while scraping.", { error }); - results[engine] = { - state: "timeout", - startedAt, - finishedAt: Date.now(), - }; - } else if (error instanceof AddFeatureError || error instanceof RemoveFeatureError) { - throw error; - } else if (error instanceof LLMRefusalError) { - results[engine] = { - state: "error", - error: safeguardCircularError(error), - unexpected: true, - startedAt, - finishedAt: Date.now(), - } - error.results = results; - meta.logger.warn("LLM refusal encountered", { error }); - throw error; - } else if (error instanceof SiteError) { - throw error; - } else { - Sentry.captureException(error); - meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error }); - results[engine] = { - state: "error", - error: safeguardCircularError(error), - unexpected: true, - startedAt, - finishedAt: Date.now(), - } - } - } + // NOTE: TODO: what to do when status code is bad is tough... + // we cannot just rely on text because error messages can be brief and not hit the limit + // should we just use all the fallbacks and pick the one with the longest text? - mogery + if (isLongEnough || !isGoodStatusCode) { + meta.logger.info("Scrape via " + engine + " deemed successful.", { + factors: { isLongEnough, isGoodStatusCode, hasNoPageError } + }); + result = { + engine, + unsupportedFeatures, + result: engineResult as EngineScrapeResult & { markdown: string } + }; + break; + } + } catch (error) { + if (error instanceof EngineError) { + meta.logger.info("Engine " + engine + " could not scrape the page.", { + error + }); + results[engine] = { + state: "error", + error: safeguardCircularError(error), + unexpected: false, + startedAt, + finishedAt: Date.now() + }; + } else if (error instanceof TimeoutError) { + meta.logger.info("Engine " + engine + " timed out while scraping.", { + error + }); + results[engine] = { + state: "timeout", + startedAt, + finishedAt: Date.now() + }; + } else if ( + error instanceof AddFeatureError || + error instanceof RemoveFeatureError + ) { + throw error; + } else if (error instanceof LLMRefusalError) { + results[engine] = { + state: "error", + error: safeguardCircularError(error), + unexpected: true, + startedAt, + finishedAt: Date.now() + }; + error.results = results; + meta.logger.warn("LLM refusal encountered", { error }); + throw error; + } else if (error instanceof SiteError) { + throw error; + } else { + Sentry.captureException(error); + meta.logger.info( + "An unexpected error happened while scraping with " + engine + ".", + { error } + ); + results[engine] = { + state: "error", + error: safeguardCircularError(error), + unexpected: true, + startedAt, + finishedAt: Date.now() + }; + } } + } - if (result === null) { - throw new NoEnginesLeftError(fallbackList.map(x => x.engine), results); + if (result === null) { + throw new NoEnginesLeftError( + fallbackList.map((x) => x.engine), + results + ); + } + + let document: Document = { + markdown: result.result.markdown, + rawHtml: result.result.html, + screenshot: result.result.screenshot, + actions: result.result.actions, + metadata: { + sourceURL: meta.url, + url: result.result.url, + statusCode: result.result.statusCode, + error: result.result.error } + }; - let document: Document = { - markdown: result.result.markdown, - rawHtml: result.result.html, - screenshot: result.result.screenshot, - actions: result.result.actions, - metadata: { - sourceURL: meta.url, - url: result.result.url, - statusCode: result.result.statusCode, - error: result.result.error, - }, - } + if (result.unsupportedFeatures.size > 0) { + const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`; + meta.logger.warn(warning, { + engine: result.engine, + unsupportedFeatures: result.unsupportedFeatures + }); + document.warning = + document.warning !== undefined + ? document.warning + " " + warning + : warning; + } - if (result.unsupportedFeatures.size > 0) { - const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`; - meta.logger.warn(warning, { engine: result.engine, unsupportedFeatures: result.unsupportedFeatures }); - document.warning = document.warning !== undefined ? document.warning + " " + warning : warning; - } + document = await executeTransformers(meta, document); - document = await executeTransformers(meta, document); - - return { - success: true, - document, - logs: meta.logs, - engines: results, - }; + return { + success: true, + document, + logs: meta.logs, + engines: results + }; } export async function scrapeURL( - id: string, - url: string, - options: ScrapeOptions, - internalOptions: InternalOptions = {}, + id: string, + url: string, + options: ScrapeOptions, + internalOptions: InternalOptions = {} ): Promise { - const meta = buildMetaObject(id, url, options, internalOptions); - try { - while (true) { - try { - return await scrapeURLLoop(meta); - } catch (error) { - if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) { - meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags }); - meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags)); - } else if (error instanceof RemoveFeatureError && meta.internalOptions.forceEngine === undefined) { - meta.logger.debug("Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), { error, existingFlags: meta.featureFlags }); - meta.featureFlags = new Set([...meta.featureFlags].filter(x => !error.featureFlags.includes(x))); - } else { - throw error; - } - } - } - } catch (error) { - let results: EngineResultsTracker = {}; - - if (error instanceof NoEnginesLeftError) { - meta.logger.warn("scrapeURL: All scraping engines failed!", { error }); - results = error.results; - } else if (error instanceof LLMRefusalError) { - meta.logger.warn("scrapeURL: LLM refused to extract content", { error }); - results = error.results!; - } else if (error instanceof Error && error.message.includes("Invalid schema for response_format")) { // TODO: seperate into custom error - meta.logger.warn("scrapeURL: LLM schema error", { error }); - // TODO: results? - } else if (error instanceof SiteError) { - meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); + const meta = buildMetaObject(id, url, options, internalOptions); + try { + while (true) { + try { + return await scrapeURLLoop(meta); + } catch (error) { + if ( + error instanceof AddFeatureError && + meta.internalOptions.forceEngine === undefined + ) { + meta.logger.debug( + "More feature flags requested by scraper: adding " + + error.featureFlags.join(", "), + { error, existingFlags: meta.featureFlags } + ); + meta.featureFlags = new Set( + [...meta.featureFlags].concat(error.featureFlags) + ); + } else if ( + error instanceof RemoveFeatureError && + meta.internalOptions.forceEngine === undefined + ) { + meta.logger.debug( + "Incorrect feature flags reported by scraper: removing " + + error.featureFlags.join(","), + { error, existingFlags: meta.featureFlags } + ); + meta.featureFlags = new Set( + [...meta.featureFlags].filter( + (x) => !error.featureFlags.includes(x) + ) + ); } else { - Sentry.captureException(error); - meta.logger.error("scrapeURL: Unexpected error happened", { error }); - // TODO: results? - } - - return { - success: false, - error, - logs: meta.logs, - engines: results, + throw error; } + } } + } catch (error) { + let results: EngineResultsTracker = {}; + + if (error instanceof NoEnginesLeftError) { + meta.logger.warn("scrapeURL: All scraping engines failed!", { error }); + results = error.results; + } else if (error instanceof LLMRefusalError) { + meta.logger.warn("scrapeURL: LLM refused to extract content", { error }); + results = error.results!; + } else if ( + error instanceof Error && + error.message.includes("Invalid schema for response_format") + ) { + // TODO: seperate into custom error + meta.logger.warn("scrapeURL: LLM schema error", { error }); + // TODO: results? + } else if (error instanceof SiteError) { + meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); + } else { + Sentry.captureException(error); + meta.logger.error("scrapeURL: Unexpected error happened", { error }); + // TODO: results? + } + + return { + success: false, + error, + logs: meta.logs, + engines: results + }; + } } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts index 484bf725..6d71c036 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts @@ -3,33 +3,36 @@ import { load } from "cheerio"; import { logger } from "../../../lib/logger"; export function extractLinks(html: string, baseUrl: string): string[] { - const $ = load(html); - const links: string[] = []; - - $('a').each((_, element) => { - const href = $(element).attr('href'); - if (href) { - try { - if (href.startsWith('http://') || href.startsWith('https://')) { - // Absolute URL, add as is - links.push(href); - } else if (href.startsWith('/')) { - // Relative URL starting with '/', append to origin - links.push(new URL(href, baseUrl).href); - } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { - // Relative URL not starting with '/', append to base URL - links.push(new URL(href, baseUrl).href); - } else if (href.startsWith('mailto:')) { - // mailto: links, add as is - links.push(href); - } - // Fragment-only links (#) are ignored - } catch (error) { - logger.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, { error }); - } + const $ = load(html); + const links: string[] = []; + + $("a").each((_, element) => { + const href = $(element).attr("href"); + if (href) { + try { + if (href.startsWith("http://") || href.startsWith("https://")) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith("/")) { + // Relative URL starting with '/', append to origin + links.push(new URL(href, baseUrl).href); + } else if (!href.startsWith("#") && !href.startsWith("mailto:")) { + // Relative URL not starting with '/', append to base URL + links.push(new URL(href, baseUrl).href); + } else if (href.startsWith("mailto:")) { + // mailto: links, add as is + links.push(href); } - }); - - // Remove duplicates and return - return [...new Set(links)]; -} \ No newline at end of file + // Fragment-only links (#) are ignored + } catch (error) { + logger.error( + `Failed to construct URL for href: ${href} with base: ${baseUrl}`, + { error } + ); + } + } + }); + + // Remove duplicates and return + return [...new Set(links)]; +} diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index f3fe5a5b..0f581373 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -2,7 +2,10 @@ import { load } from "cheerio"; import { Document } from "../../../controllers/v1/types"; import { Meta } from ".."; -export function extractMetadata(meta: Meta, html: string): Document["metadata"] { +export function extractMetadata( + meta: Meta, + html: string +): Document["metadata"] { let title: string | undefined = undefined; let description: string | undefined = undefined; let language: string | undefined = undefined; @@ -39,36 +42,54 @@ export function extractMetadata(meta: Meta, html: string): Document["metadata"] try { title = soup("title").text() || undefined; description = soup('meta[name="description"]').attr("content") || undefined; - + // Assuming the language is part of the URL as per the regex pattern - language = soup('html').attr('lang') || undefined; + language = soup("html").attr("lang") || undefined; keywords = soup('meta[name="keywords"]').attr("content") || undefined; robots = soup('meta[name="robots"]').attr("content") || undefined; ogTitle = soup('meta[property="og:title"]').attr("content") || undefined; - ogDescription = soup('meta[property="og:description"]').attr("content") || undefined; + ogDescription = + soup('meta[property="og:description"]').attr("content") || undefined; ogUrl = soup('meta[property="og:url"]').attr("content") || undefined; ogImage = soup('meta[property="og:image"]').attr("content") || undefined; ogAudio = soup('meta[property="og:audio"]').attr("content") || undefined; - ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || undefined; + ogDeterminer = + soup('meta[property="og:determiner"]').attr("content") || undefined; ogLocale = soup('meta[property="og:locale"]').attr("content") || undefined; - ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || undefined; - ogSiteName = soup('meta[property="og:site_name"]').attr("content") || undefined; + ogLocaleAlternate = + soup('meta[property="og:locale:alternate"]') + .map((i, el) => soup(el).attr("content")) + .get() || undefined; + ogSiteName = + soup('meta[property="og:site_name"]').attr("content") || undefined; ogVideo = soup('meta[property="og:video"]').attr("content") || undefined; - articleSection = soup('meta[name="article:section"]').attr("content") || undefined; + articleSection = + soup('meta[name="article:section"]').attr("content") || undefined; articleTag = soup('meta[name="article:tag"]').attr("content") || undefined; - publishedTime = soup('meta[property="article:published_time"]').attr("content") || undefined; - modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || undefined; - dcTermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || undefined; - dcDescription = soup('meta[name="dc.description"]').attr("content") || undefined; + publishedTime = + soup('meta[property="article:published_time"]').attr("content") || + undefined; + modifiedTime = + soup('meta[property="article:modified_time"]').attr("content") || + undefined; + dcTermsKeywords = + soup('meta[name="dcterms.keywords"]').attr("content") || undefined; + dcDescription = + soup('meta[name="dc.description"]').attr("content") || undefined; dcSubject = soup('meta[name="dc.subject"]').attr("content") || undefined; - dcTermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || undefined; - dcTermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || undefined; + dcTermsSubject = + soup('meta[name="dcterms.subject"]').attr("content") || undefined; + dcTermsAudience = + soup('meta[name="dcterms.audience"]').attr("content") || undefined; dcType = soup('meta[name="dc.type"]').attr("content") || undefined; - dcTermsType = soup('meta[name="dcterms.type"]').attr("content") || undefined; + dcTermsType = + soup('meta[name="dcterms.type"]').attr("content") || undefined; dcDate = soup('meta[name="dc.date"]').attr("content") || undefined; - dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || undefined; - dcTermsCreated = soup('meta[name="dcterms.created"]').attr("content") || undefined; + dcDateCreated = + soup('meta[name="dc.date.created"]').attr("content") || undefined; + dcTermsCreated = + soup('meta[name="dcterms.created"]').attr("content") || undefined; try { // Extract all meta tags for custom metadata @@ -127,6 +148,6 @@ export function extractMetadata(meta: Meta, html: string): Document["metadata"] publishedTime, articleTag, articleSection, - ...customMetadata, + ...customMetadata }; } diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 09a280b8..400c23a7 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -4,143 +4,210 @@ import { v4 as uuid } from "uuid"; import * as Sentry from "@sentry/node"; export type RobustFetchParams> = { - url: string; - logger: Logger, - method: "GET" | "POST" | "DELETE" | "PUT"; - body?: any; - headers?: Record; - schema?: Schema; - dontParseResponse?: boolean; - ignoreResponse?: boolean; - ignoreFailure?: boolean; - requestId?: string; - tryCount?: number; - tryCooldown?: number; + url: string; + logger: Logger; + method: "GET" | "POST" | "DELETE" | "PUT"; + body?: any; + headers?: Record; + schema?: Schema; + dontParseResponse?: boolean; + ignoreResponse?: boolean; + ignoreFailure?: boolean; + requestId?: string; + tryCount?: number; + tryCooldown?: number; }; -export async function robustFetch, Output = z.infer>({ +export async function robustFetch< + Schema extends z.Schema, + Output = z.infer +>({ + url, + logger, + method = "GET", + body, + headers, + schema, + ignoreResponse = false, + ignoreFailure = false, + requestId = uuid(), + tryCount = 1, + tryCooldown +}: RobustFetchParams): Promise { + const params = { url, logger, - method = "GET", + method, body, headers, schema, - ignoreResponse = false, - ignoreFailure = false, - requestId = uuid(), - tryCount = 1, - tryCooldown, -}: RobustFetchParams): Promise { - const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount, tryCooldown }; + ignoreResponse, + ignoreFailure, + tryCount, + tryCooldown + }; - let request: Response; - try { - request = await fetch(url, { - method, - headers: { - ...(body instanceof FormData - ? ({}) - : body !== undefined ? ({ - "Content-Type": "application/json", - }) : {}), - ...(headers !== undefined ? headers : {}), - }, - ...(body instanceof FormData ? ({ - body, - }) : body !== undefined ? ({ - body: JSON.stringify(body), - }) : {}), + let request: Response; + try { + request = await fetch(url, { + method, + headers: { + ...(body instanceof FormData + ? {} + : body !== undefined + ? { + "Content-Type": "application/json" + } + : {}), + ...(headers !== undefined ? headers : {}) + }, + ...(body instanceof FormData + ? { + body + } + : body !== undefined + ? { + body: JSON.stringify(body) + } + : {}) + }); + } catch (error) { + if (!ignoreFailure) { + Sentry.captureException(error); + if (tryCount > 1) { + logger.debug( + "Request failed, trying " + (tryCount - 1) + " more times", + { params, error, requestId } + ); + return await robustFetch({ + ...params, + requestId, + tryCount: tryCount - 1 }); - } catch (error) { - if (!ignoreFailure) { - Sentry.captureException(error); - if (tryCount > 1) { - logger.debug("Request failed, trying " + (tryCount - 1) + " more times", { params, error, requestId }); - return await robustFetch({ - ...params, - requestId, - tryCount: tryCount - 1, - }); - } else { - logger.debug("Request failed", { params, error, requestId }); - throw new Error("Request failed", { - cause: { - params, requestId, error, - }, - }); - } - - } else { - return null as Output; - } - } - - if (ignoreResponse === true) { - return null as Output; - } - - const response = { - status: request.status, - headers: request.headers, - body: await request.text(), // NOTE: can this throw an exception? - }; - - if (request.status >= 300) { - if (tryCount > 1) { - logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId }); - if (tryCooldown !== undefined) { - await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown)); - } - return await robustFetch({ - ...params, - requestId, - tryCount: tryCount - 1, - }); - } else { - logger.debug("Request sent failure status", { params, request, response, requestId }); - throw new Error("Request sent failure status", { - cause: { - params, request, response, requestId, - }, - }); - } - } - - let data: Output; - try { - data = JSON.parse(response.body); - } catch (error) { - logger.debug("Request sent malformed JSON", { params, request, response, requestId }); - throw new Error("Request sent malformed JSON", { - cause: { - params, request, response, requestId, - }, + } else { + logger.debug("Request failed", { params, error, requestId }); + throw new Error("Request failed", { + cause: { + params, + requestId, + error + } }); + } + } else { + return null as Output; } + } - if (schema) { - try { - data = schema.parse(data); - } catch (error) { - if (error instanceof ZodError) { - logger.debug("Response does not match provided schema", { params, request, response, requestId, error, schema }); - throw new Error("Response does not match provided schema", { - cause: { - params, request, response, requestId, - error, schema, - } - }); - } else { - logger.debug("Parsing response with provided schema failed", { params, request, response, requestId, error, schema }); - throw new Error("Parsing response with provided schema failed", { - cause: { - params, request, response, requestId, - error, schema - } - }); - } + if (ignoreResponse === true) { + return null as Output; + } + + const response = { + status: request.status, + headers: request.headers, + body: await request.text() // NOTE: can this throw an exception? + }; + + if (request.status >= 300) { + if (tryCount > 1) { + logger.debug( + "Request sent failure status, trying " + (tryCount - 1) + " more times", + { params, request, response, requestId } + ); + if (tryCooldown !== undefined) { + await new Promise((resolve) => + setTimeout(() => resolve(null), tryCooldown) + ); + } + return await robustFetch({ + ...params, + requestId, + tryCount: tryCount - 1 + }); + } else { + logger.debug("Request sent failure status", { + params, + request, + response, + requestId + }); + throw new Error("Request sent failure status", { + cause: { + params, + request, + response, + requestId } + }); } + } - return data; -} \ No newline at end of file + let data: Output; + try { + data = JSON.parse(response.body); + } catch (error) { + logger.debug("Request sent malformed JSON", { + params, + request, + response, + requestId + }); + throw new Error("Request sent malformed JSON", { + cause: { + params, + request, + response, + requestId + } + }); + } + + if (schema) { + try { + data = schema.parse(data); + } catch (error) { + if (error instanceof ZodError) { + logger.debug("Response does not match provided schema", { + params, + request, + response, + requestId, + error, + schema + }); + throw new Error("Response does not match provided schema", { + cause: { + params, + request, + response, + requestId, + error, + schema + } + }); + } else { + logger.debug("Parsing response with provided schema failed", { + params, + request, + response, + requestId, + error, + schema + }); + throw new Error("Parsing response with provided schema failed", { + cause: { + params, + request, + response, + requestId, + error, + schema + } + }); + } + } + } + + return data; +} diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts index 9458ed0f..7701aeaf 100644 --- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts +++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts @@ -4,55 +4,53 @@ import { AnyNode, Cheerio, load } from "cheerio"; import { ScrapeOptions } from "../../../controllers/v1/types"; const excludeNonMainTags = [ - "header", - "footer", - "nav", - "aside", - ".header", - ".top", - ".navbar", - "#header", - ".footer", - ".bottom", - "#footer", - ".sidebar", - ".side", - ".aside", - "#sidebar", - ".modal", - ".popup", - "#modal", - ".overlay", - ".ad", - ".ads", - ".advert", - "#ad", - ".lang-selector", - ".language", - "#language-selector", - ".social", - ".social-media", - ".social-links", - "#social", - ".menu", - ".navigation", - "#nav", - ".breadcrumbs", - "#breadcrumbs", - "#search-form", - ".search", - "#search", - ".share", - "#share", - ".widget", - "#widget", - ".cookie", - "#cookie" + "header", + "footer", + "nav", + "aside", + ".header", + ".top", + ".navbar", + "#header", + ".footer", + ".bottom", + "#footer", + ".sidebar", + ".side", + ".aside", + "#sidebar", + ".modal", + ".popup", + "#modal", + ".overlay", + ".ad", + ".ads", + ".advert", + "#ad", + ".lang-selector", + ".language", + "#language-selector", + ".social", + ".social-media", + ".social-links", + "#social", + ".menu", + ".navigation", + "#nav", + ".breadcrumbs", + "#breadcrumbs", + "#search-form", + ".search", + "#search", + ".share", + "#share", + ".widget", + "#widget", + ".cookie", + "#cookie" ]; -const forceIncludeMainTags = [ - "#main" -]; +const forceIncludeMainTags = ["#main"]; export const removeUnwantedElements = ( html: string, @@ -60,58 +58,65 @@ export const removeUnwantedElements = ( ) => { const soup = load(html); - if (scrapeOptions.includeTags && scrapeOptions.includeTags.filter(x => x.trim().length !== 0).length > 0) { + if ( + scrapeOptions.includeTags && + scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0 + ) { // Create a new root element to hold the tags to keep const newRoot = load("
")("div"); scrapeOptions.includeTags.forEach((tag) => { - soup(tag).each((_, element) => { - newRoot.append(soup(element).clone()); - }); + soup(tag).each((_, element) => { + newRoot.append(soup(element).clone()); + }); }); return newRoot.html() ?? ""; } soup("script, style, noscript, meta, head").remove(); - if (scrapeOptions.excludeTags && scrapeOptions.excludeTags.filter(x => x.trim().length !== 0).length > 0) { - scrapeOptions.excludeTags.forEach((tag) => { - let elementsToRemove: Cheerio; - if (tag.startsWith("*") && tag.endsWith("*")) { - let classMatch = false; + if ( + scrapeOptions.excludeTags && + scrapeOptions.excludeTags.filter((x) => x.trim().length !== 0).length > 0 + ) { + scrapeOptions.excludeTags.forEach((tag) => { + let elementsToRemove: Cheerio; + if (tag.startsWith("*") && tag.endsWith("*")) { + let classMatch = false; - const regexPattern = new RegExp(tag.slice(1, -1), "i"); - elementsToRemove = soup("*").filter((i, element) => { - if (element.type === "tag") { - const attributes = element.attribs; - const tagNameMatches = regexPattern.test(element.name); - const attributesMatch = Object.keys(attributes).some((attr) => - regexPattern.test(`${attr}="${attributes[attr]}"`) - ); - if (tag.startsWith("*.")) { - classMatch = Object.keys(attributes).some((attr) => - regexPattern.test(`class="${attributes[attr]}"`) - ); - } - return tagNameMatches || attributesMatch || classMatch; - } - return false; - }); - } else { - elementsToRemove = soup(tag); + const regexPattern = new RegExp(tag.slice(1, -1), "i"); + elementsToRemove = soup("*").filter((i, element) => { + if (element.type === "tag") { + const attributes = element.attribs; + const tagNameMatches = regexPattern.test(element.name); + const attributesMatch = Object.keys(attributes).some((attr) => + regexPattern.test(`${attr}="${attributes[attr]}"`) + ); + if (tag.startsWith("*.")) { + classMatch = Object.keys(attributes).some((attr) => + regexPattern.test(`class="${attributes[attr]}"`) + ); } - elementsToRemove.remove(); + return tagNameMatches || attributesMatch || classMatch; + } + return false; }); - } + } else { + elementsToRemove = soup(tag); + } + elementsToRemove.remove(); + }); + } - if (scrapeOptions.onlyMainContent) { - excludeNonMainTags.forEach((tag) => { - const elementsToRemove = soup(tag) - .filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join("")); - - elementsToRemove.remove(); - }); - } - - const cleanedHtml = soup.html(); - return cleanedHtml; + if (scrapeOptions.onlyMainContent) { + excludeNonMainTags.forEach((tag) => { + const elementsToRemove = soup(tag).filter( + forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join("") + ); + + elementsToRemove.remove(); + }); + } + + const cleanedHtml = soup.html(); + return cleanedHtml; }; diff --git a/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts b/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts index 7ce4f66e..0810dc93 100644 --- a/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts +++ b/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts @@ -2,8 +2,8 @@ import { InternalOptions } from ".."; import { ScrapeOptions } from "../../../controllers/v1/types"; export type UrlSpecificParams = { - scrapeOptions: Partial, - internalOptions: Partial, + scrapeOptions: Partial; + internalOptions: Partial; }; // const docsParam: UrlSpecificParams = { @@ -12,40 +12,40 @@ export type UrlSpecificParams = { // } export const urlSpecificParams: Record = { - // "support.greenpay.me": docsParam, - // "docs.pdw.co": docsParam, - // "developers.notion.com": docsParam, - // "docs2.hubitat.com": docsParam, - // "rsseau.fr": docsParam, - // "help.salesforce.com": docsParam, - // "scrapethissite.com": { - // scrapeOptions: {}, - // internalOptions: { forceEngine: "fetch" }, - // }, - // "eonhealth.com": { - // defaultScraper: "fire-engine", - // params: { - // fireEngineOptions: { - // mobileProxy: true, - // method: "get", - // engine: "request", - // }, - // }, - // }, - // "notion.com": { - // scrapeOptions: { waitFor: 2000 }, - // internalOptions: { forceEngine: "fire-engine;playwright" } - // }, - // "developer.apple.com": { - // scrapeOptions: { waitFor: 2000 }, - // internalOptions: { forceEngine: "fire-engine;playwright" } - // }, - "digikey.com": { - scrapeOptions: {}, - internalOptions: { forceEngine: "fire-engine;tlsclient" } - }, - "lorealparis.hu": { - scrapeOptions: {}, - internalOptions: { forceEngine: "fire-engine;tlsclient" }, - } + // "support.greenpay.me": docsParam, + // "docs.pdw.co": docsParam, + // "developers.notion.com": docsParam, + // "docs2.hubitat.com": docsParam, + // "rsseau.fr": docsParam, + // "help.salesforce.com": docsParam, + // "scrapethissite.com": { + // scrapeOptions: {}, + // internalOptions: { forceEngine: "fetch" }, + // }, + // "eonhealth.com": { + // defaultScraper: "fire-engine", + // params: { + // fireEngineOptions: { + // mobileProxy: true, + // method: "get", + // engine: "request", + // }, + // }, + // }, + // "notion.com": { + // scrapeOptions: { waitFor: 2000 }, + // internalOptions: { forceEngine: "fire-engine;playwright" } + // }, + // "developer.apple.com": { + // scrapeOptions: { waitFor: 2000 }, + // internalOptions: { forceEngine: "fire-engine;playwright" } + // }, + "digikey.com": { + scrapeOptions: {}, + internalOptions: { forceEngine: "fire-engine;tlsclient" } + }, + "lorealparis.hu": { + scrapeOptions: {}, + internalOptions: { forceEngine: "fire-engine;tlsclient" } + } }; diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index 23cf253b..8bef0c2c 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -7,384 +7,485 @@ import { scrapeOptions } from "../../controllers/v1/types"; import { Engine } from "./engines"; const testEngines: (Engine | undefined)[] = [ - undefined, - "fire-engine;chrome-cdp", - "fire-engine;playwright", - "fire-engine;tlsclient", - "scrapingbee", - "scrapingbeeLoad", - "fetch", + undefined, + "fire-engine;chrome-cdp", + "fire-engine;playwright", + "fire-engine;tlsclient", + "scrapingbee", + "scrapingbeeLoad", + "fetch" ]; const testEnginesScreenshot: (Engine | undefined)[] = [ - undefined, - "fire-engine;chrome-cdp", - "fire-engine;playwright", - "scrapingbee", - "scrapingbeeLoad", + undefined, + "fire-engine;chrome-cdp", + "fire-engine;playwright", + "scrapingbee", + "scrapingbeeLoad" ]; describe("Standalone scrapeURL tests", () => { - describe.each(testEngines)("Engine %s", (forceEngine: Engine | undefined) => { - it("Basic scrape", async () => { - const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).not.toHaveProperty("content"); - expect(out.document).toHaveProperty("markdown"); - expect(out.document).toHaveProperty("metadata"); - expect(out.document).not.toHaveProperty("html"); - expect(out.document.markdown).toContain("_Roast_"); - expect(out.document.metadata.error).toBeUndefined(); - expect(out.document.metadata.title).toBe("Roast My Website"); - expect(out.document.metadata.description).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" - ); - expect(out.document.metadata.keywords).toBe( - "Roast My Website,Roast,Website,GitHub,Firecrawl" - ); - expect(out.document.metadata.robots).toBe("follow, index"); - expect(out.document.metadata.ogTitle).toBe("Roast My Website"); - expect(out.document.metadata.ogDescription).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" - ); - expect(out.document.metadata.ogUrl).toBe( - "https://www.roastmywebsite.ai" - ); - expect(out.document.metadata.ogImage).toBe( - "https://www.roastmywebsite.ai/og.png" - ); - expect(out.document.metadata.ogLocaleAlternate).toStrictEqual([]); - expect(out.document.metadata.ogSiteName).toBe("Roast My Website"); - expect(out.document.metadata.sourceURL).toBe( - "https://www.roastmywebsite.ai/" - ); - expect(out.document.metadata.statusCode).toBe(200); - } - - }, 30000); - - it("Scrape with formats markdown and html", async () => { - const out = await scrapeURL("test:scrape-formats-markdown-html", "https://roastmywebsite.ai", scrapeOptions.parse({ - formats: ["markdown", "html"], - }), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty("markdown"); - expect(out.document).toHaveProperty("html"); - expect(out.document).toHaveProperty("metadata"); - expect(out.document.markdown).toContain("_Roast_"); - expect(out.document.html).toContain(" { - const out = await scrapeURL("test:scrape-onlyMainContent-false", "https://www.scrapethissite.com/", scrapeOptions.parse({ - onlyMainContent: false, - }), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty("markdown"); - expect(out.document).toHaveProperty("metadata"); - expect(out.document).not.toHaveProperty("html"); - expect(out.document.markdown).toContain("[FAQ](/faq/)"); // .nav - expect(out.document.markdown).toContain("Hartley Brody 2023"); // #footer - } - }, 30000); - - it("Scrape with excludeTags", async () => { - const out = await scrapeURL("test:scrape-excludeTags", "https://www.scrapethissite.com/", scrapeOptions.parse({ - onlyMainContent: false, - excludeTags: ['.nav', '#footer', 'strong'], - }), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty("markdown"); - expect(out.document).toHaveProperty("metadata"); - expect(out.document).not.toHaveProperty("html"); - expect(out.document.markdown).not.toContain("Hartley Brody 2023"); - expect(out.document.markdown).not.toContain("[FAQ](/faq/)"); - } - }, 30000); - - it("Scrape of a page with 400 status code", async () => { - const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(400); - } - }, 30000); - - it("Scrape of a page with 401 status code", async () => { - const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(401); - } - }, 30000); - - it("Scrape of a page with 403 status code", async () => { - const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(403); - } - }, 30000); - - it("Scrape of a page with 404 status code", async () => { - const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(404); - } - }, 30000); - - it("Scrape of a page with 405 status code", async () => { - const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(405); - } - }, 30000); - - it("Scrape of a page with 500 status code", async () => { - const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(500); - } - }, 30000); + describe.each(testEngines)("Engine %s", (forceEngine: Engine | undefined) => { + it("Basic scrape", async () => { + const out = await scrapeURL( + "test:scrape-basic", + "https://www.roastmywebsite.ai/", + scrapeOptions.parse({}), + { forceEngine } + ); - it("Scrape a redirected page", async () => { - const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document.markdown).toContain("Explore Sandbox"); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.sourceURL).toBe("https://scrapethissite.com/"); - expect(out.document.metadata.url).toBe("https://www.scrapethissite.com/"); - expect(out.document.metadata.statusCode).toBe(200); - expect(out.document.metadata.error).toBeUndefined(); - } - }, 30000); - }); - - describe.each(testEnginesScreenshot)("Screenshot on engine %s", (forceEngine: Engine | undefined) => { - it("Scrape with screenshot", async () => { - const out = await scrapeURL("test:scrape-screenshot", "https://www.scrapethissite.com/", scrapeOptions.parse({ - formats: ["screenshot"], - }), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('screenshot'); - expect(typeof out.document.screenshot).toBe("string"); - expect(out.document.screenshot!.startsWith("https://service.firecrawl.dev/storage/v1/object/public/media/")); - // TODO: attempt to fetch screenshot - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(200); - expect(out.document.metadata.error).toBeUndefined(); - } - }, 30000); - - it("Scrape with full-page screenshot", async () => { - const out = await scrapeURL("test:scrape-screenshot-fullPage", "https://www.scrapethissite.com/", scrapeOptions.parse({ - formats: ["screenshot@fullPage"], - }), { forceEngine }); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('screenshot'); - expect(typeof out.document.screenshot).toBe("string"); - expect(out.document.screenshot!.startsWith("https://service.firecrawl.dev/storage/v1/object/public/media/")); - // TODO: attempt to fetch screenshot - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.statusCode).toBe(200); - expect(out.document.metadata.error).toBeUndefined(); - } - }, 30000); - }); - - it("Scrape of a PDF file", async () => { - const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({})); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.markdown).toContain('Broad Line Radio Galaxy'); - expect(out.document.metadata.statusCode).toBe(200); - expect(out.document.metadata.error).toBeUndefined(); - } - }, 60000); - - it("Scrape a DOCX file", async () => { - const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({})); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.markdown).toContain('SERIES A PREFERRED STOCK PURCHASE AGREEMENT'); - expect(out.document.metadata.statusCode).toBe(200); - expect(out.document.metadata.error).toBeUndefined(); - } - }, 60000) - - it("LLM extract with prompt and schema", async () => { - const out = await scrapeURL("test:llm-extract-prompt-schema", "https://firecrawl.dev", scrapeOptions.parse({ - formats: ["extract"], - extract: { - prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", - schema: { - type: "object", - properties: { - company_mission: { type: "string" }, - supports_sso: { type: "boolean" }, - is_open_source: { type: "boolean" }, - }, - required: ["company_mission", "supports_sso", "is_open_source"], - additionalProperties: false, - }, - }, - })); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty("extract"); - expect(out.document.extract).toHaveProperty("company_mission"); - expect(out.document.extract).toHaveProperty("supports_sso"); - expect(out.document.extract).toHaveProperty("is_open_source"); - expect(typeof out.document.extract.company_mission).toBe("string"); - expect(out.document.extract.supports_sso).toBe(false); - expect(out.document.extract.is_open_source).toBe(true); - } - }, 120000) - - it("LLM extract with schema only", async () => { - const out = await scrapeURL("test:llm-extract-schema", "https://firecrawl.dev", scrapeOptions.parse({ - formats: ["extract"], - extract: { - schema: { - type: "object", - properties: { - company_mission: { type: "string" }, - supports_sso: { type: "boolean" }, - is_open_source: { type: "boolean" }, - }, - required: ["company_mission", "supports_sso", "is_open_source"], - additionalProperties: false, - }, - }, - })); - - // expect(out.logs.length).toBeGreaterThan(0); - expect(out.success).toBe(true); - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty("extract"); - expect(out.document.extract).toHaveProperty("company_mission"); - expect(out.document.extract).toHaveProperty("supports_sso"); - expect(out.document.extract).toHaveProperty("is_open_source"); - expect(typeof out.document.extract.company_mission).toBe("string"); - expect(out.document.extract.supports_sso).toBe(false); - expect(out.document.extract.is_open_source).toBe(true); - } - }, 120000) - - test.concurrent.each(new Array(100).fill(0).map((_, i) => i))("Concurrent scrape #%i", async (i) => { - const url = "https://www.scrapethissite.com/?i=" + i; - const id = "test:concurrent:" + url; - const out = await scrapeURL(id, url, scrapeOptions.parse({})); - - const replacer = (key: string, value: any) => { - if (value instanceof Error) { - return { - ...value, - message: value.message, - name: value.name, - cause: value.cause, - stack: value.stack, - } - } else { - return value; - } - } - - // verify that log collection works properly while concurrency is happening - // expect(out.logs.length).toBeGreaterThan(0); - const weirdLogs = out.logs.filter(x => x.scrapeId !== id); - if (weirdLogs.length > 0) { - console.warn(JSON.stringify(weirdLogs, replacer)); - } - expect(weirdLogs.length).toBe(0); - - if (!out.success) console.error(JSON.stringify(out, replacer)); - expect(out.success).toBe(true); - - if (out.success) { - expect(out.document.warning).toBeUndefined(); - expect(out.document).toHaveProperty('markdown'); - expect(out.document).toHaveProperty('metadata'); - expect(out.document.metadata.error).toBeUndefined(); - expect(out.document.metadata.statusCode).toBe(200); - } + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).not.toHaveProperty("content"); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document).not.toHaveProperty("html"); + expect(out.document.markdown).toContain("_Roast_"); + expect(out.document.metadata.error).toBeUndefined(); + expect(out.document.metadata.title).toBe("Roast My Website"); + expect(out.document.metadata.description).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(out.document.metadata.keywords).toBe( + "Roast My Website,Roast,Website,GitHub,Firecrawl" + ); + expect(out.document.metadata.robots).toBe("follow, index"); + expect(out.document.metadata.ogTitle).toBe("Roast My Website"); + expect(out.document.metadata.ogDescription).toBe( + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + ); + expect(out.document.metadata.ogUrl).toBe( + "https://www.roastmywebsite.ai" + ); + expect(out.document.metadata.ogImage).toBe( + "https://www.roastmywebsite.ai/og.png" + ); + expect(out.document.metadata.ogLocaleAlternate).toStrictEqual([]); + expect(out.document.metadata.ogSiteName).toBe("Roast My Website"); + expect(out.document.metadata.sourceURL).toBe( + "https://www.roastmywebsite.ai/" + ); + expect(out.document.metadata.statusCode).toBe(200); + } }, 30000); -}) + + it("Scrape with formats markdown and html", async () => { + const out = await scrapeURL( + "test:scrape-formats-markdown-html", + "https://roastmywebsite.ai", + scrapeOptions.parse({ + formats: ["markdown", "html"] + }), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("html"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.markdown).toContain("_Roast_"); + expect(out.document.html).toContain(" { + const out = await scrapeURL( + "test:scrape-onlyMainContent-false", + "https://www.scrapethissite.com/", + scrapeOptions.parse({ + onlyMainContent: false + }), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document).not.toHaveProperty("html"); + expect(out.document.markdown).toContain("[FAQ](/faq/)"); // .nav + expect(out.document.markdown).toContain("Hartley Brody 2023"); // #footer + } + }, 30000); + + it("Scrape with excludeTags", async () => { + const out = await scrapeURL( + "test:scrape-excludeTags", + "https://www.scrapethissite.com/", + scrapeOptions.parse({ + onlyMainContent: false, + excludeTags: [".nav", "#footer", "strong"] + }), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document).not.toHaveProperty("html"); + expect(out.document.markdown).not.toContain("Hartley Brody 2023"); + expect(out.document.markdown).not.toContain("[FAQ](/faq/)"); + } + }, 30000); + + it("Scrape of a page with 400 status code", async () => { + const out = await scrapeURL( + "test:scrape-400", + "https://httpstat.us/400", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(400); + } + }, 30000); + + it("Scrape of a page with 401 status code", async () => { + const out = await scrapeURL( + "test:scrape-401", + "https://httpstat.us/401", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(401); + } + }, 30000); + + it("Scrape of a page with 403 status code", async () => { + const out = await scrapeURL( + "test:scrape-403", + "https://httpstat.us/403", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(403); + } + }, 30000); + + it("Scrape of a page with 404 status code", async () => { + const out = await scrapeURL( + "test:scrape-404", + "https://httpstat.us/404", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(404); + } + }, 30000); + + it("Scrape of a page with 405 status code", async () => { + const out = await scrapeURL( + "test:scrape-405", + "https://httpstat.us/405", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(405); + } + }, 30000); + + it("Scrape of a page with 500 status code", async () => { + const out = await scrapeURL( + "test:scrape-500", + "https://httpstat.us/500", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(500); + } + }, 30000); + + it("Scrape a redirected page", async () => { + const out = await scrapeURL( + "test:scrape-redirect", + "https://scrapethissite.com/", + scrapeOptions.parse({}), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document.markdown).toContain("Explore Sandbox"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.sourceURL).toBe( + "https://scrapethissite.com/" + ); + expect(out.document.metadata.url).toBe( + "https://www.scrapethissite.com/" + ); + expect(out.document.metadata.statusCode).toBe(200); + expect(out.document.metadata.error).toBeUndefined(); + } + }, 30000); + }); + + describe.each(testEnginesScreenshot)( + "Screenshot on engine %s", + (forceEngine: Engine | undefined) => { + it("Scrape with screenshot", async () => { + const out = await scrapeURL( + "test:scrape-screenshot", + "https://www.scrapethissite.com/", + scrapeOptions.parse({ + formats: ["screenshot"] + }), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("screenshot"); + expect(typeof out.document.screenshot).toBe("string"); + expect( + out.document.screenshot!.startsWith( + "https://service.firecrawl.dev/storage/v1/object/public/media/" + ) + ); + // TODO: attempt to fetch screenshot + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(200); + expect(out.document.metadata.error).toBeUndefined(); + } + }, 30000); + + it("Scrape with full-page screenshot", async () => { + const out = await scrapeURL( + "test:scrape-screenshot-fullPage", + "https://www.scrapethissite.com/", + scrapeOptions.parse({ + formats: ["screenshot@fullPage"] + }), + { forceEngine } + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("screenshot"); + expect(typeof out.document.screenshot).toBe("string"); + expect( + out.document.screenshot!.startsWith( + "https://service.firecrawl.dev/storage/v1/object/public/media/" + ) + ); + // TODO: attempt to fetch screenshot + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.statusCode).toBe(200); + expect(out.document.metadata.error).toBeUndefined(); + } + }, 30000); + } + ); + + it("Scrape of a PDF file", async () => { + const out = await scrapeURL( + "test:scrape-pdf", + "https://arxiv.org/pdf/astro-ph/9301001.pdf", + scrapeOptions.parse({}) + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.markdown).toContain("Broad Line Radio Galaxy"); + expect(out.document.metadata.statusCode).toBe(200); + expect(out.document.metadata.error).toBeUndefined(); + } + }, 60000); + + it("Scrape a DOCX file", async () => { + const out = await scrapeURL( + "test:scrape-docx", + "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", + scrapeOptions.parse({}) + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.markdown).toContain( + "SERIES A PREFERRED STOCK PURCHASE AGREEMENT" + ); + expect(out.document.metadata.statusCode).toBe(200); + expect(out.document.metadata.error).toBeUndefined(); + } + }, 60000); + + it("LLM extract with prompt and schema", async () => { + const out = await scrapeURL( + "test:llm-extract-prompt-schema", + "https://firecrawl.dev", + scrapeOptions.parse({ + formats: ["extract"], + extract: { + prompt: + "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + schema: { + type: "object", + properties: { + company_mission: { type: "string" }, + supports_sso: { type: "boolean" }, + is_open_source: { type: "boolean" } + }, + required: ["company_mission", "supports_sso", "is_open_source"], + additionalProperties: false + } + } + }) + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("extract"); + expect(out.document.extract).toHaveProperty("company_mission"); + expect(out.document.extract).toHaveProperty("supports_sso"); + expect(out.document.extract).toHaveProperty("is_open_source"); + expect(typeof out.document.extract.company_mission).toBe("string"); + expect(out.document.extract.supports_sso).toBe(false); + expect(out.document.extract.is_open_source).toBe(true); + } + }, 120000); + + it("LLM extract with schema only", async () => { + const out = await scrapeURL( + "test:llm-extract-schema", + "https://firecrawl.dev", + scrapeOptions.parse({ + formats: ["extract"], + extract: { + schema: { + type: "object", + properties: { + company_mission: { type: "string" }, + supports_sso: { type: "boolean" }, + is_open_source: { type: "boolean" } + }, + required: ["company_mission", "supports_sso", "is_open_source"], + additionalProperties: false + } + } + }) + ); + + // expect(out.logs.length).toBeGreaterThan(0); + expect(out.success).toBe(true); + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("extract"); + expect(out.document.extract).toHaveProperty("company_mission"); + expect(out.document.extract).toHaveProperty("supports_sso"); + expect(out.document.extract).toHaveProperty("is_open_source"); + expect(typeof out.document.extract.company_mission).toBe("string"); + expect(out.document.extract.supports_sso).toBe(false); + expect(out.document.extract.is_open_source).toBe(true); + } + }, 120000); + + test.concurrent.each(new Array(100).fill(0).map((_, i) => i))( + "Concurrent scrape #%i", + async (i) => { + const url = "https://www.scrapethissite.com/?i=" + i; + const id = "test:concurrent:" + url; + const out = await scrapeURL(id, url, scrapeOptions.parse({})); + + const replacer = (key: string, value: any) => { + if (value instanceof Error) { + return { + ...value, + message: value.message, + name: value.name, + cause: value.cause, + stack: value.stack + }; + } else { + return value; + } + }; + + // verify that log collection works properly while concurrency is happening + // expect(out.logs.length).toBeGreaterThan(0); + const weirdLogs = out.logs.filter((x) => x.scrapeId !== id); + if (weirdLogs.length > 0) { + console.warn(JSON.stringify(weirdLogs, replacer)); + } + expect(weirdLogs.length).toBe(0); + + if (!out.success) console.error(JSON.stringify(out, replacer)); + expect(out.success).toBe(true); + + if (out.success) { + expect(out.document.warning).toBeUndefined(); + expect(out.document).toHaveProperty("markdown"); + expect(out.document).toHaveProperty("metadata"); + expect(out.document.metadata.error).toBeUndefined(); + expect(out.document.metadata.statusCode).toBe(200); + } + }, + 30000 + ); +}); diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts index e0c09c44..4a31da1f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts @@ -3,24 +3,30 @@ import { Meta } from ".."; import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache"; export function saveToCache(meta: Meta, document: Document): Document { - if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document; - - if (document.rawHtml === undefined) { - throw new Error("rawHtml is undefined -- this transformer is being called out of order"); - } - - const key = cacheKey(meta.url, meta.options, meta.internalOptions); - - if (key !== null) { - const entry: CacheEntry = { - html: document.rawHtml!, - statusCode: document.metadata.statusCode!, - url: document.metadata.url ?? document.metadata.sourceURL!, - error: document.metadata.error ?? undefined, - }; - - saveEntryToCache(key, entry); - } - + if ( + document.metadata.statusCode! < 200 || + document.metadata.statusCode! >= 300 + ) return document; -} \ No newline at end of file + + if (document.rawHtml === undefined) { + throw new Error( + "rawHtml is undefined -- this transformer is being called out of order" + ); + } + + const key = cacheKey(meta.url, meta.options, meta.internalOptions); + + if (key !== null) { + const entry: CacheEntry = { + html: document.rawHtml!, + statusCode: document.metadata.statusCode!, + url: document.metadata.url ?? document.metadata.sourceURL!, + error: document.metadata.error ?? undefined + }; + + saveEntryToCache(key, entry); + } + + return document; +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index b8063f7e..5afceda2 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -9,127 +9,180 @@ import { uploadScreenshot } from "./uploadScreenshot"; import { removeBase64Images } from "./removeBase64Images"; import { saveToCache } from "./cache"; -export type Transformer = (meta: Meta, document: Document) => Document | Promise; +export type Transformer = ( + meta: Meta, + document: Document +) => Document | Promise; -export function deriveMetadataFromRawHTML(meta: Meta, document: Document): Document { - if (document.rawHtml === undefined) { - throw new Error("rawHtml is undefined -- this transformer is being called out of order"); - } +export function deriveMetadataFromRawHTML( + meta: Meta, + document: Document +): Document { + if (document.rawHtml === undefined) { + throw new Error( + "rawHtml is undefined -- this transformer is being called out of order" + ); + } - document.metadata = { - ...extractMetadata(meta, document.rawHtml), - ...document.metadata, - }; - return document; + document.metadata = { + ...extractMetadata(meta, document.rawHtml), + ...document.metadata + }; + return document; } -export function deriveHTMLFromRawHTML(meta: Meta, document: Document): Document { - if (document.rawHtml === undefined) { - throw new Error("rawHtml is undefined -- this transformer is being called out of order"); - } +export function deriveHTMLFromRawHTML( + meta: Meta, + document: Document +): Document { + if (document.rawHtml === undefined) { + throw new Error( + "rawHtml is undefined -- this transformer is being called out of order" + ); + } - document.html = removeUnwantedElements(document.rawHtml, meta.options); - return document; + document.html = removeUnwantedElements(document.rawHtml, meta.options); + return document; } -export async function deriveMarkdownFromHTML(_meta: Meta, document: Document): Promise { - if (document.html === undefined) { - throw new Error("html is undefined -- this transformer is being called out of order"); - } +export async function deriveMarkdownFromHTML( + _meta: Meta, + document: Document +): Promise { + if (document.html === undefined) { + throw new Error( + "html is undefined -- this transformer is being called out of order" + ); + } - document.markdown = await parseMarkdown(document.html); - return document; + document.markdown = await parseMarkdown(document.html); + return document; } export function deriveLinksFromHTML(meta: Meta, document: Document): Document { - // Only derive if the formats has links - if (meta.options.formats.includes("links")) { - if (document.html === undefined) { - throw new Error("html is undefined -- this transformer is being called out of order"); - } - - document.links = extractLinks(document.html, meta.url); + // Only derive if the formats has links + if (meta.options.formats.includes("links")) { + if (document.html === undefined) { + throw new Error( + "html is undefined -- this transformer is being called out of order" + ); } - return document; + document.links = extractLinks(document.html, meta.url); + } + + return document; } -export function coerceFieldsToFormats(meta: Meta, document: Document): Document { - const formats = new Set(meta.options.formats); +export function coerceFieldsToFormats( + meta: Meta, + document: Document +): Document { + const formats = new Set(meta.options.formats); - if (!formats.has("markdown") && document.markdown !== undefined) { - delete document.markdown; - } else if (formats.has("markdown") && document.markdown === undefined) { - meta.logger.warn("Request had format: markdown, but there was no markdown field in the result."); - } + if (!formats.has("markdown") && document.markdown !== undefined) { + delete document.markdown; + } else if (formats.has("markdown") && document.markdown === undefined) { + meta.logger.warn( + "Request had format: markdown, but there was no markdown field in the result." + ); + } - if (!formats.has("rawHtml") && document.rawHtml !== undefined) { - delete document.rawHtml; - } else if (formats.has("rawHtml") && document.rawHtml === undefined) { - meta.logger.warn("Request had format: rawHtml, but there was no rawHtml field in the result."); - } + if (!formats.has("rawHtml") && document.rawHtml !== undefined) { + delete document.rawHtml; + } else if (formats.has("rawHtml") && document.rawHtml === undefined) { + meta.logger.warn( + "Request had format: rawHtml, but there was no rawHtml field in the result." + ); + } - if (!formats.has("html") && document.html !== undefined) { - delete document.html; - } else if (formats.has("html") && document.html === undefined) { - meta.logger.warn("Request had format: html, but there was no html field in the result."); - } + if (!formats.has("html") && document.html !== undefined) { + delete document.html; + } else if (formats.has("html") && document.html === undefined) { + meta.logger.warn( + "Request had format: html, but there was no html field in the result." + ); + } - if (!formats.has("screenshot") && !formats.has("screenshot@fullPage") && document.screenshot !== undefined) { - meta.logger.warn("Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug."); - delete document.screenshot; - } else if ((formats.has("screenshot") || formats.has("screenshot@fullPage")) && document.screenshot === undefined) { - meta.logger.warn("Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result."); - } + if ( + !formats.has("screenshot") && + !formats.has("screenshot@fullPage") && + document.screenshot !== undefined + ) { + meta.logger.warn( + "Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug." + ); + delete document.screenshot; + } else if ( + (formats.has("screenshot") || formats.has("screenshot@fullPage")) && + document.screenshot === undefined + ) { + meta.logger.warn( + "Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result." + ); + } - if (!formats.has("links") && document.links !== undefined) { - meta.logger.warn("Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug."); - delete document.links; - } else if (formats.has("links") && document.links === undefined) { - meta.logger.warn("Request had format: links, but there was no links field in the result."); - } + if (!formats.has("links") && document.links !== undefined) { + meta.logger.warn( + "Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug." + ); + delete document.links; + } else if (formats.has("links") && document.links === undefined) { + meta.logger.warn( + "Request had format: links, but there was no links field in the result." + ); + } - if (!formats.has("extract") && document.extract !== undefined) { - meta.logger.warn("Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug."); - delete document.extract; - } else if (formats.has("extract") && document.extract === undefined) { - meta.logger.warn("Request had format: extract, but there was no extract field in the result."); - } + if (!formats.has("extract") && document.extract !== undefined) { + meta.logger.warn( + "Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug." + ); + delete document.extract; + } else if (formats.has("extract") && document.extract === undefined) { + meta.logger.warn( + "Request had format: extract, but there was no extract field in the result." + ); + } - if (meta.options.actions === undefined || meta.options.actions.length === 0) { - delete document.actions; - } + if (meta.options.actions === undefined || meta.options.actions.length === 0) { + delete document.actions; + } - return document; + return document; } // TODO: allow some of these to run in parallel export const transformerStack: Transformer[] = [ - saveToCache, - deriveHTMLFromRawHTML, - deriveMarkdownFromHTML, - deriveLinksFromHTML, - deriveMetadataFromRawHTML, - uploadScreenshot, - performLLMExtract, - coerceFieldsToFormats, - removeBase64Images, + saveToCache, + deriveHTMLFromRawHTML, + deriveMarkdownFromHTML, + deriveLinksFromHTML, + deriveMetadataFromRawHTML, + uploadScreenshot, + performLLMExtract, + coerceFieldsToFormats, + removeBase64Images ]; -export async function executeTransformers(meta: Meta, document: Document): Promise { - const executions: [string, number][] = []; +export async function executeTransformers( + meta: Meta, + document: Document +): Promise { + const executions: [string, number][] = []; - for (const transformer of transformerStack) { - const _meta = { - ...meta, - logger: meta.logger.child({ method: "executeTransformers/" + transformer.name }), - }; - const start = Date.now(); - document = await transformer(_meta, document); - executions.push([transformer.name, Date.now() - start]); - } + for (const transformer of transformerStack) { + const _meta = { + ...meta, + logger: meta.logger.child({ + method: "executeTransformers/" + transformer.name + }) + }; + const start = Date.now(); + document = await transformer(_meta, document); + executions.push([transformer.name, Date.now() - start]); + } - meta.logger.debug("Executed transformers.", { executions }); + meta.logger.debug("Executed transformers.", { executions }); - return document; + return document; } diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 1c6adcd1..f09073ee 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -9,186 +9,226 @@ const maxTokens = 32000; const modifier = 4; export class LLMRefusalError extends Error { - public refusal: string; - public results: EngineResultsTracker | undefined; + public refusal: string; + public results: EngineResultsTracker | undefined; - constructor(refusal: string) { - super("LLM refused to extract the website's content") - this.refusal = refusal; - } + constructor(refusal: string) { + super("LLM refused to extract the website's content"); + this.refusal = refusal; + } } function normalizeSchema(x: any): any { - if (typeof x !== "object" || x === null) return x; + if (typeof x !== "object" || x === null) return x; - if (x["$defs"] !== null && typeof x["$defs"] === "object") { - x["$defs"] = Object.fromEntries(Object.entries(x["$defs"]).map(([name, schema]) => [name, normalizeSchema(schema)])); - } + if (x["$defs"] !== null && typeof x["$defs"] === "object") { + x["$defs"] = Object.fromEntries( + Object.entries(x["$defs"]).map(([name, schema]) => [ + name, + normalizeSchema(schema) + ]) + ); + } - if (x && x.anyOf) { - x.anyOf = x.anyOf.map(x => normalizeSchema(x)); - } + if (x && x.anyOf) { + x.anyOf = x.anyOf.map((x) => normalizeSchema(x)); + } - if (x && x.oneOf) { - x.oneOf = x.oneOf.map(x => normalizeSchema(x)); - } + if (x && x.oneOf) { + x.oneOf = x.oneOf.map((x) => normalizeSchema(x)); + } - if (x && x.allOf) { - x.allOf = x.allOf.map(x => normalizeSchema(x)); - } + if (x && x.allOf) { + x.allOf = x.allOf.map((x) => normalizeSchema(x)); + } - if (x && x.not) { - x.not = normalizeSchema(x.not); - } + if (x && x.not) { + x.not = normalizeSchema(x.not); + } - if (x && x.type === "object") { - return { - ...x, - properties: Object.fromEntries(Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)])), - required: Object.keys(x.properties), - additionalProperties: false, - } - } else if (x && x.type === "array") { - return { - ...x, - items: normalizeSchema(x.items), - } - } else { - return x; - } + if (x && x.type === "object") { + return { + ...x, + properties: Object.fromEntries( + Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]) + ), + required: Object.keys(x.properties), + additionalProperties: false + }; + } else if (x && x.type === "array") { + return { + ...x, + items: normalizeSchema(x.items) + }; + } else { + return x; + } } -export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string, isExtractEndpoint?: boolean): Promise<{ extract: any, numTokens: number, warning: string | undefined }> { - let extract: any; - let warning: string | undefined; +export async function generateOpenAICompletions( + logger: Logger, + options: ExtractOptions, + markdown?: string, + previousWarning?: string, + isExtractEndpoint?: boolean +): Promise<{ extract: any; numTokens: number; warning: string | undefined }> { + let extract: any; + let warning: string | undefined; - const openai = new OpenAI(); - const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; + const openai = new OpenAI(); + const model: TiktokenModel = + (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; - if (markdown === undefined) { - throw new Error("document.markdown is undefined -- this is unexpected"); - } + if (markdown === undefined) { + throw new Error("document.markdown is undefined -- this is unexpected"); + } - // count number of tokens - let numTokens = 0; - const encoder = encoding_for_model(model as TiktokenModel); + // count number of tokens + let numTokens = 0; + const encoder = encoding_for_model(model as TiktokenModel); + try { + // Encode the message into tokens + const tokens = encoder.encode(markdown); + + // Return the number of tokens + numTokens = tokens.length; + } catch (error) { + logger.warn("Calculating num tokens of string failed", { error, markdown }); + + markdown = markdown.slice(0, maxTokens * modifier); + + let w = + "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + + maxTokens + + ") we support."; + warning = previousWarning === undefined ? w : w + " " + previousWarning; + } finally { + // Free the encoder resources after use + encoder.free(); + } + + if (numTokens > maxTokens) { + // trim the document to the maximum number of tokens, tokens != characters + markdown = markdown.slice(0, maxTokens * modifier); + + const w = + "The extraction content would have used more tokens (" + + numTokens + + ") than the maximum we allow (" + + maxTokens + + "). -- the input has been automatically trimmed."; + warning = previousWarning === undefined ? w : w + " " + previousWarning; + } + + let schema = options.schema; + if (schema && schema.type === "array") { + schema = { + type: "object", + properties: { + items: options.schema + }, + required: ["items"], + additionalProperties: false + }; + } else if (schema && typeof schema === "object" && !schema.type) { + schema = { + type: "object", + properties: Object.fromEntries( + Object.entries(schema).map(([key, value]) => [key, { type: value }]) + ), + required: Object.keys(schema), + additionalProperties: false + }; + } + + schema = normalizeSchema(schema); + + const jsonCompletion = await openai.beta.chat.completions.parse({ + model, + temperature: 0, + messages: [ + { + role: "system", + content: options.systemPrompt + }, + { + role: "user", + content: [{ type: "text", text: markdown }] + }, + { + role: "user", + content: + options.prompt !== undefined + ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` + : "Transform the above content into structured JSON output." + } + ], + response_format: options.schema + ? { + type: "json_schema", + json_schema: { + name: "websiteContent", + schema: schema, + strict: true + } + } + : { type: "json_object" } + }); + + if (jsonCompletion.choices[0].message.refusal !== null) { + throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal); + } + + extract = jsonCompletion.choices[0].message.parsed; + + if (extract === null && jsonCompletion.choices[0].message.content !== null) { try { - // Encode the message into tokens - const tokens = encoder.encode(markdown); - - // Return the number of tokens - numTokens = tokens.length; - } catch (error) { - logger.warn("Calculating num tokens of string failed", { error, markdown }); - - markdown = markdown.slice(0, maxTokens * modifier); - - let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } finally { - // Free the encoder resources after use - encoder.free(); - } - - if (numTokens > maxTokens) { - // trim the document to the maximum number of tokens, tokens != characters - markdown = markdown.slice(0, maxTokens * modifier); - - const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; - warning = previousWarning === undefined ? w : w + " " + previousWarning; - } - - let schema = options.schema; - if (schema && schema.type === "array") { - schema = { - type: "object", - properties: { - items: options.schema, - }, - required: ["items"], - additionalProperties: false, - }; - } else if (schema && typeof schema === 'object' && !schema.type) { - schema = { - type: "object", - properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]) - ), - required: Object.keys(schema), - additionalProperties: false - }; - } - - schema = normalizeSchema(schema); - - const jsonCompletion = await openai.beta.chat.completions.parse({ - model, - temperature: 0, - messages: [ - { - role: "system", - content: options.systemPrompt, - }, - { - role: "user", - content: [{ type: "text", text: markdown }], - }, - { - role: "user", - content: options.prompt !== undefined - ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` - : "Transform the above content into structured JSON output.", - }, - ], - response_format: options.schema ? { - type: "json_schema", - json_schema: { - name: "websiteContent", - schema: schema, - strict: true, - } - } : { type: "json_object" }, - }); - - if (jsonCompletion.choices[0].message.refusal !== null) { - throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal); - } - - extract = jsonCompletion.choices[0].message.parsed; - - if (extract === null && jsonCompletion.choices[0].message.content !== null) { - try { - if (!isExtractEndpoint) { - extract = JSON.parse(jsonCompletion.choices[0].message.content); - } else { - const extractData = JSON.parse(jsonCompletion.choices[0].message.content); - extract = options.schema ? extractData.data.extract : extractData; - } - } catch (e) { - logger.error("Failed to parse returned JSON, no schema specified.", { error: e }); - throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object."); - } - } - - // If the users actually wants the items object, they can specify it as 'required' in the schema - // otherwise, we just return the items array - if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) { - extract = extract?.items; - } - return { extract, warning, numTokens }; -} - -export async function performLLMExtract(meta: Meta, document: Document): Promise { - if (meta.options.formats.includes("extract")) { - const { extract, warning } = await generateOpenAICompletions( - meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), - meta.options.extract!, - document.markdown, - document.warning, + if (!isExtractEndpoint) { + extract = JSON.parse(jsonCompletion.choices[0].message.content); + } else { + const extractData = JSON.parse( + jsonCompletion.choices[0].message.content ); - document.extract = extract; - document.warning = warning; + extract = options.schema ? extractData.data.extract : extractData; + } + } catch (e) { + logger.error("Failed to parse returned JSON, no schema specified.", { + error: e + }); + throw new LLMRefusalError( + "Failed to parse returned JSON. Please specify a schema in the extract object." + ); } + } - return document; + // If the users actually wants the items object, they can specify it as 'required' in the schema + // otherwise, we just return the items array + if ( + options.schema && + options.schema.type === "array" && + !schema?.required?.includes("items") + ) { + extract = extract?.items; + } + return { extract, warning, numTokens }; +} + +export async function performLLMExtract( + meta: Meta, + document: Document +): Promise { + if (meta.options.formats.includes("extract")) { + const { extract, warning } = await generateOpenAICompletions( + meta.logger.child({ + method: "performLLMExtract/generateOpenAICompletions" + }), + meta.options.extract!, + document.markdown, + document.warning + ); + document.extract = extract; + document.warning = warning; + } + + return document; } diff --git a/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts b/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts index 92628f8a..3bc408ff 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts @@ -4,8 +4,11 @@ import { Document } from "../../../controllers/v1/types"; const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g; export function removeBase64Images(meta: Meta, document: Document): Document { - if (meta.options.removeBase64Images && document.markdown !== undefined) { - document.markdown = document.markdown.replace(regex, '$1()'); - } - return document; -} \ No newline at end of file + if (meta.options.removeBase64Images && document.markdown !== undefined) { + document.markdown = document.markdown.replace( + regex, + "$1()" + ); + } + return document; +} diff --git a/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts b/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts index 4c3fc2b4..ed01af69 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts @@ -6,21 +6,29 @@ import { Meta } from ".."; import { Document } from "../../../controllers/v1/types"; export function uploadScreenshot(meta: Meta, document: Document): Document { - if (process.env.USE_DB_AUTHENTICATION === "true" && document.screenshot !== undefined && document.screenshot.startsWith("data:")) { - meta.logger.debug("Uploading screenshot to Supabase..."); + if ( + process.env.USE_DB_AUTHENTICATION === "true" && + document.screenshot !== undefined && + document.screenshot.startsWith("data:") + ) { + meta.logger.debug("Uploading screenshot to Supabase..."); - const fileName = `screenshot-${crypto.randomUUID()}.png`; + const fileName = `screenshot-${crypto.randomUUID()}.png`; - supabase_service.storage - .from("media") - .upload(fileName, Buffer.from(document.screenshot.split(",")[1], "base64"), { - cacheControl: "3600", - upsert: false, - contentType: document.screenshot.split(":")[1].split(";")[0], - }); - - document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`; - } + supabase_service.storage + .from("media") + .upload( + fileName, + Buffer.from(document.screenshot.split(",")[1], "base64"), + { + cacheControl: "3600", + upsert: false, + contentType: document.screenshot.split(":")[1].split(";")[0] + } + ); - return document; + document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`; + } + + return document; } diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index c1417af1..3fa9c588 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -25,7 +25,7 @@ export async function fireEngineMap( location: options.location, tbs: options.tbs, numResults: options.numResults, - page: options.page ?? 1, + page: options.page ?? 1 }); if (!process.env.FIRE_ENGINE_BETA_URL) { @@ -39,9 +39,9 @@ export async function fireEngineMap( method: "POST", headers: { "Content-Type": "application/json", - "X-Disable-Cache": "true", + "X-Disable-Cache": "true" }, - body: data, + body: data }); if (response.ok) { diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 59662829..a7c78fc9 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -1,114 +1,150 @@ -import axios from 'axios'; -import * as cheerio from 'cheerio'; -import * as querystring from 'querystring'; -import { SearchResult } from '../../src/lib/entities'; -import { logger } from '../../src/lib/logger'; +import axios from "axios"; +import * as cheerio from "cheerio"; +import * as querystring from "querystring"; +import { SearchResult } from "../../src/lib/entities"; +import { logger } from "../../src/lib/logger"; const _useragent_list = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0" ]; function get_useragent(): string { - return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; + return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; } -async function _req(term: string, results: number, lang: string, country: string, start: number, proxies: any, timeout: number, tbs: string | undefined = undefined, filter: string | undefined = undefined) { - const params = { - "q": term, - "num": results, // Number of results to return - "hl": lang, - "gl": country, - "start": start, - }; - if (tbs) { - params["tbs"] = tbs; +async function _req( + term: string, + results: number, + lang: string, + country: string, + start: number, + proxies: any, + timeout: number, + tbs: string | undefined = undefined, + filter: string | undefined = undefined +) { + const params = { + q: term, + num: results, // Number of results to return + hl: lang, + gl: country, + start: start + }; + if (tbs) { + params["tbs"] = tbs; + } + if (filter) { + params["filter"] = filter; + } + try { + const resp = await axios.get("https://www.google.com/search", { + headers: { + "User-Agent": get_useragent() + }, + params: params, + proxy: proxies, + timeout: timeout + }); + return resp; + } catch (error) { + if (error.response && error.response.status === 429) { + throw new Error("Google Search: Too many requests, try again later."); } - if (filter) { - params["filter"] = filter; + throw error; + } +} + +export async function googleSearch( + term: string, + advanced = false, + num_results = 7, + tbs = undefined as string | undefined, + filter = undefined as string | undefined, + lang = "en", + country = "us", + proxy = undefined as string | undefined, + sleep_interval = 0, + timeout = 5000 +): Promise { + let proxies: any = null; + if (proxy) { + if (proxy.startsWith("https")) { + proxies = { https: proxy }; + } else { + proxies = { http: proxy }; } + } + + // TODO: knowledge graph, answer box, etc. + + let start = 0; + let results: SearchResult[] = []; + let attempts = 0; + const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop + while (start < num_results && attempts < maxAttempts) { try { - const resp = await axios.get("https://www.google.com/search", { - headers: { - "User-Agent": get_useragent() - }, - params: params, - proxy: proxies, - timeout: timeout, - }); - return resp; + const resp = await _req( + term, + num_results - start, + lang, + country, + start, + proxies, + timeout, + tbs, + filter + ); + const $ = cheerio.load(resp.data); + const result_block = $("div.g"); + if (result_block.length === 0) { + start += 1; + attempts += 1; + } else { + attempts = 0; // Reset attempts if we have results + } + result_block.each((index, element) => { + const linkElement = $(element).find("a"); + const link = + linkElement && linkElement.attr("href") + ? linkElement.attr("href") + : null; + const title = $(element).find("h3"); + const ogImage = $(element).find("img").eq(1).attr("src"); + const description_box = $(element).find( + "div[style='-webkit-line-clamp:2']" + ); + const answerBox = $(element).find(".mod").text(); + if (description_box) { + const description = description_box.text(); + if (link && title && description) { + start += 1; + results.push(new SearchResult(link, title.text(), description)); + } + } + }); + await new Promise((resolve) => + setTimeout(resolve, sleep_interval * 1000) + ); } catch (error) { - if (error.response && error.response.status === 429) { - throw new Error('Google Search: Too many requests, try again later.'); - } - throw error; + if (error.message === "Too many requests") { + logger.warn("Too many requests, breaking the loop"); + break; + } + throw error; } -} - - - -export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = undefined as string | undefined, filter = undefined as string | undefined, lang = "en", country = "us", proxy = undefined as string | undefined, sleep_interval = 0, timeout = 5000, ) :Promise { - let proxies: any = null; - if (proxy) { - if (proxy.startsWith("https")) { - proxies = {"https": proxy}; - } else { - proxies = {"http": proxy}; - } - } - - // TODO: knowledge graph, answer box, etc. - - let start = 0; - let results : SearchResult[] = []; - let attempts = 0; - const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop - while (start < num_results && attempts < maxAttempts) { - try { - const resp = await _req(term, num_results - start, lang, country, start, proxies, timeout, tbs, filter); - const $ = cheerio.load(resp.data); - const result_block = $("div.g"); - if (result_block.length === 0) { - start += 1; - attempts += 1; - } else { - attempts = 0; // Reset attempts if we have results - } - result_block.each((index, element) => { - const linkElement = $(element).find("a"); - const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; - const title = $(element).find("h3"); - const ogImage = $(element).find("img").eq(1).attr("src"); - const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); - const answerBox = $(element).find(".mod").text(); - if (description_box) { - const description = description_box.text(); - if (link && title && description) { - start += 1; - results.push(new SearchResult(link, title.text(), description)); - } - } - }); - await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); - } catch (error) { - if (error.message === 'Too many requests') { - logger.warn('Too many requests, breaking the loop'); - break; - } - throw error; - } - - if (start === 0) { - return results; - } - } - if (attempts >= maxAttempts) { - logger.warn('Max attempts reached, breaking the loop'); - } - return results + + if (start === 0) { + return results; + } + } + if (attempts >= maxAttempts) { + logger.warn("Max attempts reached, breaking the loop"); + } + return results; } diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 5899af87..978a57e0 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -16,7 +16,7 @@ export async function search({ location = undefined, proxy = undefined, sleep_interval = 0, - timeout = 5000, + timeout = 5000 }: { query: string; advanced?: boolean; @@ -38,7 +38,7 @@ export async function search({ filter, lang, country, - location, + location }); } if (process.env.SEARCHAPI_API_KEY) { diff --git a/apps/api/src/search/searchapi.ts b/apps/api/src/search/searchapi.ts index 24778a77..ea21c8d3 100644 --- a/apps/api/src/search/searchapi.ts +++ b/apps/api/src/search/searchapi.ts @@ -14,7 +14,10 @@ interface SearchOptions { page?: number; } -export async function searchapi_search(q: string, options: SearchOptions): Promise { +export async function searchapi_search( + q: string, + options: SearchOptions +): Promise { const params = { q: q, hl: options.lang, @@ -22,7 +25,7 @@ export async function searchapi_search(q: string, options: SearchOptions): Promi location: options.location, num: options.num_results, page: options.page ?? 1, - engine: process.env.SEARCHAPI_ENGINE || "google", + engine: process.env.SEARCHAPI_ENGINE || "google" }; const url = `https://www.searchapi.io/api/v1/search`; @@ -30,14 +33,13 @@ export async function searchapi_search(q: string, options: SearchOptions): Promi try { const response = await axios.get(url, { headers: { - "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`, + Authorization: `Bearer ${process.env.SEARCHAPI_API_KEY}`, "Content-Type": "application/json", - "X-SearchApi-Source": "Firecrawl", + "X-SearchApi-Source": "Firecrawl" }, - params: params, + params: params }); - if (response.status === 401) { throw new Error("Unauthorized. Please check your API key."); } @@ -48,7 +50,7 @@ export async function searchapi_search(q: string, options: SearchOptions): Promi return data.organic_results.map((a: any) => ({ url: a.link, title: a.title, - description: a.snippet, + description: a.snippet })); } else { return []; diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts index be716367..4abf720d 100644 --- a/apps/api/src/search/serper.ts +++ b/apps/api/src/search/serper.ts @@ -4,7 +4,9 @@ import { SearchResult } from "../../src/lib/entities"; dotenv.config(); -export async function serper_search(q, options: { +export async function serper_search( + q, + options: { tbs?: string; filter?: string; lang?: string; @@ -12,7 +14,8 @@ export async function serper_search(q, options: { location?: string; num_results: number; page?: number; -}): Promise { + } +): Promise { let data = JSON.stringify({ q: q, hl: options.lang, @@ -20,7 +23,7 @@ export async function serper_search(q, options: { location: options.location, tbs: options.tbs, num: options.num_results, - page: options.page ?? 1, + page: options.page ?? 1 }); let config = { @@ -28,18 +31,18 @@ export async function serper_search(q, options: { url: "https://google.serper.dev/search", headers: { "X-API-KEY": process.env.SERPER_API_KEY, - "Content-Type": "application/json", + "Content-Type": "application/json" }, - data: data, + data: data }; const response = await axios(config); if (response && response.data && Array.isArray(response.data.organic)) { return response.data.organic.map((a) => ({ url: a.link, title: a.title, - description: a.snippet, + description: a.snippet })); - }else{ + } else { return []; } } diff --git a/apps/api/src/services/alerts/index.ts b/apps/api/src/services/alerts/index.ts index 826bb18e..3aaea3aa 100644 --- a/apps/api/src/services/alerts/index.ts +++ b/apps/api/src/services/alerts/index.ts @@ -54,7 +54,7 @@ export async function checkAlerts() { }; await checkAll(); - // setInterval(checkAll, 10000); // Run every + // setInterval(checkAll, 10000); // Run every } } catch (error) { logger.error(`Failed to initialize alerts: ${error}`); diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index 1eac5343..11280f28 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -8,14 +8,14 @@ export async function sendSlackWebhook( ) { const messagePrefix = alertEveryone ? " " : ""; const payload = { - text: `${messagePrefix} ${message}`, + text: `${messagePrefix} ${message}` }; try { const response = await axios.post(webhookUrl, payload, { headers: { - "Content-Type": "application/json", - }, + "Content-Type": "application/json" + } }); logger.info("Webhook sent successfully:", response.data); } catch (error) { diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index 1659a110..3411c921 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -23,7 +23,12 @@ const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds export async function autoCharge( chunk: AuthCreditUsageChunk, autoRechargeThreshold: number -): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> { +): Promise<{ + success: boolean; + message: string; + remainingCredits: number; + chunk: AuthCreditUsageChunk; +}> { const resource = `auto-recharge:${chunk.team_id}`; const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`; @@ -32,145 +37,162 @@ export async function autoCharge( // Another check to prevent race conditions, double charging - cool down of 5 minutes const cooldownValue = await getValue(cooldownKey); if (cooldownValue) { - logger.info(`Auto-recharge for team ${chunk.team_id} is in cooldown period`); + logger.info( + `Auto-recharge for team ${chunk.team_id} is in cooldown period` + ); return { success: false, message: "Auto-recharge is in cooldown period", remainingCredits: chunk.remaining_credits, - chunk, + chunk }; } // Use a distributed lock to prevent concurrent auto-charge attempts - return await redlock.using([resource], 5000, async (signal) : Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> => { - // Recheck the condition inside the lock to prevent race conditions - const updatedChunk = await getACUC(chunk.api_key, false, false); - if ( - updatedChunk && - updatedChunk.remaining_credits < autoRechargeThreshold - ) { - if (chunk.sub_user_id) { - // Fetch the customer's Stripe information - const { data: customer, error: customersError } = - await supabase_service - .from("customers") - .select("id, stripe_customer_id") - .eq("id", chunk.sub_user_id) - .single(); - - if (customersError) { - logger.error(`Error fetching customer data: ${customersError}`); - return { - success: false, - message: "Error fetching customer data", - remainingCredits: chunk.remaining_credits, - chunk, - }; - } + return await redlock.using( + [resource], + 5000, + async ( + signal + ): Promise<{ + success: boolean; + message: string; + remainingCredits: number; + chunk: AuthCreditUsageChunk; + }> => { + // Recheck the condition inside the lock to prevent race conditions + const updatedChunk = await getACUC(chunk.api_key, false, false); + if ( + updatedChunk && + updatedChunk.remaining_credits < autoRechargeThreshold + ) { + if (chunk.sub_user_id) { + // Fetch the customer's Stripe information + const { data: customer, error: customersError } = + await supabase_service + .from("customers") + .select("id, stripe_customer_id") + .eq("id", chunk.sub_user_id) + .single(); - if (customer && customer.stripe_customer_id) { - let issueCreditsSuccess = false; - // Attempt to create a payment intent - const paymentStatus = await createPaymentIntent( - chunk.team_id, - customer.stripe_customer_id - ); - - // If payment is successful or requires further action, issue credits - if ( - paymentStatus.return_status === "succeeded" || - paymentStatus.return_status === "requires_action" - ) { - issueCreditsSuccess = await issueCredits( + if (customersError) { + logger.error(`Error fetching customer data: ${customersError}`); + return { + success: false, + message: "Error fetching customer data", + remainingCredits: chunk.remaining_credits, + chunk + }; + } + + if (customer && customer.stripe_customer_id) { + let issueCreditsSuccess = false; + // Attempt to create a payment intent + const paymentStatus = await createPaymentIntent( chunk.team_id, - AUTO_RECHARGE_CREDITS - ); - } - - // Record the auto-recharge transaction - await supabase_service.from("auto_recharge_transactions").insert({ - team_id: chunk.team_id, - initial_payment_status: paymentStatus.return_status, - credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0, - stripe_charge_id: paymentStatus.charge_id, - }); - - // Send a notification if credits were successfully issued - if (issueCreditsSuccess) { - await sendNotification( - chunk.team_id, - NotificationType.AUTO_RECHARGE_SUCCESS, - chunk.sub_current_period_start, - chunk.sub_current_period_end, - chunk, - true + customer.stripe_customer_id ); - // Set cooldown period - await setValue(cooldownKey, 'true', AUTO_RECHARGE_COOLDOWN); - } - - // Reset ACUC cache to reflect the new credit balance - const cacheKeyACUC = `acuc_${chunk.api_key}`; - await deleteKey(cacheKeyACUC); - - if (process.env.SLACK_ADMIN_WEBHOOK_URL) { - const webhookCooldownKey = `webhook_cooldown_${chunk.team_id}`; - const isInCooldown = await getValue(webhookCooldownKey); - - if (!isInCooldown) { - sendSlackWebhook( - `Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`, - false, - process.env.SLACK_ADMIN_WEBHOOK_URL - ).catch((error) => { - logger.debug(`Error sending slack notification: ${error}`); - }); - - // Set cooldown for 1 hour - await setValue(webhookCooldownKey, 'true', 60 * 60); + // If payment is successful or requires further action, issue credits + if ( + paymentStatus.return_status === "succeeded" || + paymentStatus.return_status === "requires_action" + ) { + issueCreditsSuccess = await issueCredits( + chunk.team_id, + AUTO_RECHARGE_CREDITS + ); } + + // Record the auto-recharge transaction + await supabase_service.from("auto_recharge_transactions").insert({ + team_id: chunk.team_id, + initial_payment_status: paymentStatus.return_status, + credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0, + stripe_charge_id: paymentStatus.charge_id + }); + + // Send a notification if credits were successfully issued + if (issueCreditsSuccess) { + await sendNotification( + chunk.team_id, + NotificationType.AUTO_RECHARGE_SUCCESS, + chunk.sub_current_period_start, + chunk.sub_current_period_end, + chunk, + true + ); + + // Set cooldown period + await setValue(cooldownKey, "true", AUTO_RECHARGE_COOLDOWN); + } + + // Reset ACUC cache to reflect the new credit balance + const cacheKeyACUC = `acuc_${chunk.api_key}`; + await deleteKey(cacheKeyACUC); + + if (process.env.SLACK_ADMIN_WEBHOOK_URL) { + const webhookCooldownKey = `webhook_cooldown_${chunk.team_id}`; + const isInCooldown = await getValue(webhookCooldownKey); + + if (!isInCooldown) { + sendSlackWebhook( + `Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`, + false, + process.env.SLACK_ADMIN_WEBHOOK_URL + ).catch((error) => { + logger.debug(`Error sending slack notification: ${error}`); + }); + + // Set cooldown for 1 hour + await setValue(webhookCooldownKey, "true", 60 * 60); + } + } + return { + success: true, + message: "Auto-recharge successful", + remainingCredits: + chunk.remaining_credits + AUTO_RECHARGE_CREDITS, + chunk: { + ...chunk, + remaining_credits: + chunk.remaining_credits + AUTO_RECHARGE_CREDITS + } + }; + } else { + logger.error("No Stripe customer ID found for user"); + return { + success: false, + message: "No Stripe customer ID found for user", + remainingCredits: chunk.remaining_credits, + chunk + }; } - return { - success: true, - message: "Auto-recharge successful", - remainingCredits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS, - chunk: {...chunk, remaining_credits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS}, - }; } else { - logger.error("No Stripe customer ID found for user"); + logger.error("No sub_user_id found in chunk"); return { success: false, - message: "No Stripe customer ID found for user", + message: "No sub_user_id found in chunk", remainingCredits: chunk.remaining_credits, - chunk, + chunk }; } - } else { - logger.error("No sub_user_id found in chunk"); - return { - success: false, - message: "No sub_user_id found in chunk", - remainingCredits: chunk.remaining_credits, - chunk, - }; } + return { + success: false, + message: "No need to auto-recharge", + remainingCredits: chunk.remaining_credits, + chunk + }; } - return { - success: false, - message: "No need to auto-recharge", - remainingCredits: chunk.remaining_credits, - chunk, - }; - - }); + ); } catch (error) { logger.error(`Failed to acquire lock for auto-recharge: ${error}`); return { success: false, message: "Failed to acquire lock for auto-recharge", remainingCredits: chunk.remaining_credits, - chunk, + chunk }; } } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 8558d8ba..f25e165e 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -16,10 +16,22 @@ const FREE_CREDITS = 500; /** * If you do not know the subscription_id in the current context, pass subscription_id as undefined. */ -export async function billTeam(team_id: string, subscription_id: string | null | undefined, credits: number) { - return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(team_id, subscription_id, credits); +export async function billTeam( + team_id: string, + subscription_id: string | null | undefined, + credits: number +) { + return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })( + team_id, + subscription_id, + credits + ); } -export async function supaBillTeam(team_id: string, subscription_id: string | null | undefined, credits: number) { +export async function supaBillTeam( + team_id: string, + subscription_id: string | null | undefined, + credits: number +) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } @@ -29,7 +41,7 @@ export async function supaBillTeam(team_id: string, subscription_id: string | nu _team_id: team_id, sub_id: subscription_id ?? null, fetch_subscription: subscription_id === undefined, - credits, + credits }); if (error) { @@ -46,7 +58,7 @@ export async function supaBillTeam(team_id: string, subscription_id: string | nu ...acuc, credits_used: acuc.credits_used + credits, adjusted_credits_used: acuc.adjusted_credits_used + credits, - remaining_credits: acuc.remaining_credits - credits, + remaining_credits: acuc.remaining_credits - credits } : null ); @@ -55,21 +67,37 @@ export async function supaBillTeam(team_id: string, subscription_id: string | nu } export type CheckTeamCreditsResponse = { - success: boolean, - message: string, - remainingCredits: number, - chunk?: AuthCreditUsageChunk, -} + success: boolean; + message: string; + remainingCredits: number; + chunk?: AuthCreditUsageChunk; +}; -export async function checkTeamCredits(chunk: AuthCreditUsageChunk | null, team_id: string, credits: number): Promise { - return withAuth(supaCheckTeamCredits, { success: true, message: "No DB, bypassed", remainingCredits: Infinity })(chunk, team_id, credits); +export async function checkTeamCredits( + chunk: AuthCreditUsageChunk | null, + team_id: string, + credits: number +): Promise { + return withAuth(supaCheckTeamCredits, { + success: true, + message: "No DB, bypassed", + remainingCredits: Infinity + })(chunk, team_id, credits); } // if team has enough credits for the operation, return true, else return false -export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk | null, team_id: string, credits: number): Promise { +export async function supaCheckTeamCredits( + chunk: AuthCreditUsageChunk | null, + team_id: string, + credits: number +): Promise { // WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery if (team_id === "preview") { - return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity }; + return { + success: true, + message: "Preview team, no credits used", + remainingCredits: Infinity + }; } else if (chunk === null) { throw new Error("NULL ACUC passed to supaCheckTeamCredits"); } @@ -81,7 +109,8 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk | null, t // Removal of + credits const creditUsagePercentage = chunk.adjusted_credits_used / totalPriceCredits; - let isAutoRechargeEnabled = false, autoRechargeThreshold = 1000; + let isAutoRechargeEnabled = false, + autoRechargeThreshold = 1000; const cacheKey = `team_auto_recharge_${team_id}`; let cachedData = await getValue(cacheKey); if (cachedData) { @@ -102,16 +131,19 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk | null, t } } - if (isAutoRechargeEnabled && chunk.remaining_credits < autoRechargeThreshold) { + if ( + isAutoRechargeEnabled && + chunk.remaining_credits < autoRechargeThreshold + ) { const autoChargeResult = await autoCharge(chunk, autoRechargeThreshold); if (autoChargeResult.success) { return { success: true, - message: autoChargeResult.message, - remainingCredits: autoChargeResult.remainingCredits, - chunk: autoChargeResult.chunk, - }; - } + message: autoChargeResult.message, + remainingCredits: autoChargeResult.remainingCredits, + chunk: autoChargeResult.chunk + }; + } } // Compare the adjusted total credits used with the credits allowed by the plan @@ -131,7 +163,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk | null, t message: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, - chunk, + chunk }; } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit @@ -148,7 +180,7 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk | null, t success: true, message: "Sufficient credits available", remainingCredits: chunk.remaining_credits, - chunk, + chunk }; } @@ -202,7 +234,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( return { totalCreditsUsed: totalCreditsUsed, remainingCredits, - totalCredits: FREE_CREDITS + couponCredits, + totalCredits: FREE_CREDITS + couponCredits }; } @@ -241,6 +273,6 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( return { totalCreditsUsed, remainingCredits, - totalCredits: price.credits, + totalCredits: price.credits }; } diff --git a/apps/api/src/services/billing/issue_credits.ts b/apps/api/src/services/billing/issue_credits.ts index ce84db1b..3f013a1c 100644 --- a/apps/api/src/services/billing/issue_credits.ts +++ b/apps/api/src/services/billing/issue_credits.ts @@ -8,7 +8,7 @@ export async function issueCredits(team_id: string, credits: number) { credits: credits, status: "active", // indicates that this coupon was issued from auto recharge - from_auto_recharge: true, + from_auto_recharge: true }); if (error) { diff --git a/apps/api/src/services/billing/stripe.ts b/apps/api/src/services/billing/stripe.ts index db482e91..c5b76445 100644 --- a/apps/api/src/services/billing/stripe.ts +++ b/apps/api/src/services/billing/stripe.ts @@ -5,7 +5,7 @@ const stripe = new Stripe(process.env.STRIPE_SECRET_KEY ?? ""); async function getCustomerDefaultPaymentMethod(customerId: string) { const paymentMethods = await stripe.customers.listPaymentMethods(customerId, { - limit: 3, + limit: 3 }); return paymentMethods.data[0] ?? null; } @@ -16,9 +16,12 @@ export async function createPaymentIntent( customer_id: string ): Promise<{ return_status: ReturnStatus; charge_id: string }> { try { - const defaultPaymentMethod = await getCustomerDefaultPaymentMethod(customer_id); + const defaultPaymentMethod = + await getCustomerDefaultPaymentMethod(customer_id); if (!defaultPaymentMethod) { - logger.error(`No default payment method found for customer: ${customer_id}`); + logger.error( + `No default payment method found for customer: ${customer_id}` + ); return { return_status: "failed", charge_id: "" }; } const paymentIntent = await stripe.paymentIntents.create({ @@ -29,7 +32,7 @@ export async function createPaymentIntent( payment_method_types: [defaultPaymentMethod?.type ?? "card"], payment_method: defaultPaymentMethod?.id, off_session: true, - confirm: true, + confirm: true }); if (paymentIntent.status === "succeeded") { diff --git a/apps/api/src/services/idempotency/create.ts b/apps/api/src/services/idempotency/create.ts index f29fc70f..8e1ede44 100644 --- a/apps/api/src/services/idempotency/create.ts +++ b/apps/api/src/services/idempotency/create.ts @@ -2,10 +2,8 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; import { logger } from "../../../src/lib/logger"; -export async function createIdempotencyKey( - req: Request, -): Promise { - const idempotencyKey = req.headers['x-idempotency-key'] as string; +export async function createIdempotencyKey(req: Request): Promise { + const idempotencyKey = req.headers["x-idempotency-key"] as string; if (!idempotencyKey) { throw new Error("No idempotency key provided in the request headers."); } diff --git a/apps/api/src/services/idempotency/validate.ts b/apps/api/src/services/idempotency/validate.ts index ca3acab1..5a347f67 100644 --- a/apps/api/src/services/idempotency/validate.ts +++ b/apps/api/src/services/idempotency/validate.ts @@ -1,28 +1,28 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; -import { validate as isUuid } from 'uuid'; +import { validate as isUuid } from "uuid"; import { logger } from "../../../src/lib/logger"; -export async function validateIdempotencyKey( - req: Request, -): Promise { - const idempotencyKey = req.headers['x-idempotency-key']; +export async function validateIdempotencyKey(req: Request): Promise { + const idempotencyKey = req.headers["x-idempotency-key"]; if (!idempotencyKey) { // // not returning for missing idempotency key for now return true; } - // Ensure idempotencyKey is treated as a string - const key = Array.isArray(idempotencyKey) ? idempotencyKey[0] : idempotencyKey; - if (!isUuid(key)) { - logger.debug("Invalid idempotency key provided in the request headers."); - return false; - } + // Ensure idempotencyKey is treated as a string + const key = Array.isArray(idempotencyKey) + ? idempotencyKey[0] + : idempotencyKey; + if (!isUuid(key)) { + logger.debug("Invalid idempotency key provided in the request headers."); + return false; + } const { data, error } = await supabase_service .from("idempotency_keys") .select("key") .eq("key", idempotencyKey); - + if (error) { logger.error(`Error validating idempotency key: ${error}`); } diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index 0160828e..bfdc84ce 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -4,17 +4,17 @@ import { configDotenv } from "dotenv"; configDotenv(); export async function logCrawl(job_id: string, team_id: string) { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (useDbAuthentication) { try { const { data, error } = await supabase_service - .from("bulljobs_teams") - .insert([ - { - job_id: job_id, - team_id: team_id, - }, - ]); + .from("bulljobs_teams") + .insert([ + { + job_id: job_id, + team_id: team_id + } + ]); } catch (error) { logger.error(`Error logging crawl job to supabase:\n${error}`); } diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index aaecad25..c3111dd7 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -9,7 +9,7 @@ configDotenv(); export async function logJob(job: FirecrawlJob, force: boolean = false) { try { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (!useDbAuthentication) { return; } @@ -21,7 +21,12 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { job.scrapeOptions.headers["Authorization"] ) { job.scrapeOptions.headers["Authorization"] = "REDACTED"; - job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }]; + job.docs = [ + { + content: "REDACTED DUE TO AUTHORIZATION HEADER", + html: "REDACTED DUE TO AUTHORIZATION HEADER" + } + ]; } const jobColumn = { job_id: job.job_id ? job.job_id : null, @@ -38,25 +43,34 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { origin: job.origin, num_tokens: job.num_tokens, retry: !!job.retry, - crawl_id: job.crawl_id, + crawl_id: job.crawl_id }; if (force) { - let i = 0, done = false; + let i = 0, + done = false; while (i++ <= 10) { try { const { error } = await supabase_service .from("firecrawl_jobs") .insert([jobColumn]); if (error) { - logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id }); - await new Promise((resolve) => setTimeout(() => resolve(), 75)); + logger.error( + "Failed to log job due to Supabase error -- trying again", + { error, scrapeId: job.job_id } + ); + await new Promise((resolve) => + setTimeout(() => resolve(), 75) + ); } else { done = true; break; } } catch (error) { - logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id }); + logger.error( + "Failed to log job due to thrown error -- trying again", + { error, scrapeId: job.job_id } + ); await new Promise((resolve) => setTimeout(() => resolve(), 75)); } } @@ -70,7 +84,10 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { .from("firecrawl_jobs") .insert([jobColumn]); if (error) { - logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id }); + logger.error(`Error logging job: ${error.message}`, { + error, + scrapeId: job.job_id + }); } else { logger.debug("Job logged successfully!", { scrapeId: job.job_id }); } @@ -80,7 +97,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { let phLog = { distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user ...(job.team_id !== "preview" && { - groups: { team: job.team_id }, + groups: { team: job.team_id } }), //* Identifying event on this team event: "job-logged", properties: { @@ -95,14 +112,13 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { page_options: job.scrapeOptions, origin: job.origin, num_tokens: job.num_tokens, - retry: job.retry, - }, + retry: job.retry + } }; - if(job.mode !== "single_urls") { + if (job.mode !== "single_urls") { posthog.capture(phLog); } } - } catch (error) { logger.error(`Error logging job: ${error.message}`); } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 441b3894..3ccaf777 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -10,7 +10,7 @@ export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (!useDbAuthentication) { logger.debug("Skipping logging scrape to Supabase"); return; @@ -42,8 +42,8 @@ export async function logScrape( date_added: new Date().toISOString(), html: "Removed to save db space", ipv4_support: scrapeLog.ipv4_support, - ipv6_support: scrapeLog.ipv6_support, - }, + ipv6_support: scrapeLog.ipv6_support + } ]); if (error) { diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 982d402e..22c23865 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -14,25 +14,25 @@ const emailTemplates: Record< > = { [NotificationType.APPROACHING_LIMIT]: { subject: "You've used 80% of your credit limit - Firecrawl", - html: "Hey there,

Thanks,
Firecrawl Team
", + html: "Hey there,

You are approaching your credit limit for this billing period. Your usage right now is around 80% of your total credit limit. Consider upgrading your plan to avoid hitting the limit. Check out our pricing page for more info.


Thanks,
Firecrawl Team
" }, [NotificationType.LIMIT_REACHED]: { subject: "Credit Limit Reached! Take action now to resume usage - Firecrawl", - html: "Hey there,

You have reached your credit limit for this billing period. To resume usage, please upgrade your plan. Check out our pricing page for more info.


Thanks,
Firecrawl Team
", + html: "Hey there,

You have reached your credit limit for this billing period. To resume usage, please upgrade your plan. Check out our pricing page for more info.


Thanks,
Firecrawl Team
" }, [NotificationType.RATE_LIMIT_REACHED]: { subject: "Rate Limit Reached - Firecrawl", - html: "Hey there,

You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.

If you have any questions, feel free to reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team

Ps. this email is only sent once every 7 days if you reach a rate limit.", + html: "Hey there,

You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.

If you have any questions, feel free to reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team

Ps. this email is only sent once every 7 days if you reach a rate limit." }, [NotificationType.AUTO_RECHARGE_SUCCESS]: { subject: "Auto recharge successful - Firecrawl", - html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at firecrawl.dev/pricing to avoid hitting the limit.


Thanks,
Firecrawl Team
", + html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at firecrawl.dev/pricing to avoid hitting the limit.


Thanks,
Firecrawl Team
" }, [NotificationType.AUTO_RECHARGE_FAILED]: { subject: "Auto recharge failed - Firecrawl", - html: "Hey there,

Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team
", - }, + html: "Hey there,

Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team
" + } }; export async function sendNotification( @@ -55,7 +55,7 @@ export async function sendNotification( export async function sendEmailNotification( email: string, - notificationType: NotificationType, + notificationType: NotificationType ) { const resend = new Resend(process.env.RESEND_API_KEY); @@ -65,7 +65,7 @@ export async function sendEmailNotification( to: [email], reply_to: "help@firecrawl.com", subject: emailTemplates[notificationType].subject, - html: emailTemplates[notificationType].html, + html: emailTemplates[notificationType].html }); if (error) { @@ -89,91 +89,97 @@ export async function sendNotificationInternal( if (team_id === "preview") { return { success: true }; } - return await redlock.using([`notification-lock:${team_id}:${notificationType}`], 5000, async () => { + return await redlock.using( + [`notification-lock:${team_id}:${notificationType}`], + 5000, + async () => { + if (!bypassRecentChecks) { + const fifteenDaysAgo = new Date(); + fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15); - if (!bypassRecentChecks) { - const fifteenDaysAgo = new Date(); - fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15); + const { data, error } = await supabase_service + .from("user_notifications") + .select("*") + .eq("team_id", team_id) + .eq("notification_type", notificationType) + .gte("sent_date", fifteenDaysAgo.toISOString()); - const { data, error } = await supabase_service - .from("user_notifications") - .select("*") - .eq("team_id", team_id) - .eq("notification_type", notificationType) - .gte("sent_date", fifteenDaysAgo.toISOString()); + if (error) { + logger.debug(`Error fetching notifications: ${error}`); + return { success: false }; + } - if (error) { - logger.debug(`Error fetching notifications: ${error}`); - return { success: false }; + if (data.length !== 0) { + return { success: false }; + } + + // TODO: observation: Free credits people are not receiving notifications + + const { data: recentData, error: recentError } = await supabase_service + .from("user_notifications") + .select("*") + .eq("team_id", team_id) + .eq("notification_type", notificationType) + .gte("sent_date", startDateString) + .lte("sent_date", endDateString); + + if (recentError) { + logger.debug( + `Error fetching recent notifications: ${recentError.message}` + ); + return { success: false }; + } + + if (recentData.length !== 0) { + return { success: false }; + } + } + + console.log( + `Sending notification for team_id: ${team_id} and notificationType: ${notificationType}` + ); + // get the emails from the user with the team_id + const { data: emails, error: emailsError } = await supabase_service + .from("users") + .select("email") + .eq("team_id", team_id); + + if (emailsError) { + logger.debug(`Error fetching emails: ${emailsError}`); + return { success: false }; + } + + for (const email of emails) { + await sendEmailNotification(email.email, notificationType); + } + + const { error: insertError } = await supabase_service + .from("user_notifications") + .insert([ + { + team_id: team_id, + notification_type: notificationType, + sent_date: new Date().toISOString(), + timestamp: new Date().toISOString() + } + ]); + + if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { + sendSlackWebhook( + `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`, + false, + process.env.SLACK_ADMIN_WEBHOOK_URL + ).catch((error) => { + logger.debug(`Error sending slack notification: ${error}`); + }); + } + + if (insertError) { + logger.debug(`Error inserting notification record: ${insertError}`); + return { success: false }; + } + + return { success: true }; } - - if (data.length !== 0) { - return { success: false }; - } - - // TODO: observation: Free credits people are not receiving notifications - - const { data: recentData, error: recentError } = await supabase_service - .from("user_notifications") - .select("*") - .eq("team_id", team_id) - .eq("notification_type", notificationType) - .gte("sent_date", startDateString) - .lte("sent_date", endDateString); - - if (recentError) { - logger.debug(`Error fetching recent notifications: ${recentError.message}`); - return { success: false }; - } - - if (recentData.length !== 0) { - return { success: false }; - } - - } - - console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`); - // get the emails from the user with the team_id - const { data: emails, error: emailsError } = await supabase_service - .from("users") - .select("email") - .eq("team_id", team_id); - - if (emailsError) { - logger.debug(`Error fetching emails: ${emailsError}`); - return { success: false }; - } - - for (const email of emails) { - await sendEmailNotification(email.email, notificationType); - } - - const { error: insertError } = await supabase_service - .from("user_notifications") - .insert([ - { - team_id: team_id, - notification_type: notificationType, - sent_date: new Date().toISOString(), - timestamp: new Date().toISOString(), - }, - ]); - - if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { - sendSlackWebhook( - `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`, - false, - process.env.SLACK_ADMIN_WEBHOOK_URL - ).catch((error) => { - logger.debug(`Error sending slack notification: ${error}`); - }); - } - - if (insertError) { - logger.debug(`Error inserting notification record: ${insertError}`); - return { success: false }; - } - - return { success: true }; - }); + ); } diff --git a/apps/api/src/services/posthog.ts b/apps/api/src/services/posthog.ts index e3a01353..69f370ec 100644 --- a/apps/api/src/services/posthog.ts +++ b/apps/api/src/services/posthog.ts @@ -1,6 +1,6 @@ -import { PostHog } from 'posthog-node'; +import { PostHog } from "posthog-node"; import "dotenv/config"; -import { logger } from '../../src/lib/logger'; +import { logger } from "../../src/lib/logger"; export default function PostHogClient(apiKey: string) { const posthogClient = new PostHog(apiKey, { @@ -24,4 +24,4 @@ export const posthog = process.env.POSTHOG_API_KEY "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more." ); return new MockPostHog(); - })(); \ No newline at end of file + })(); diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index bc2debfe..b4bd799b 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -3,7 +3,13 @@ import { getScrapeQueue } from "./queue-service"; import { v4 as uuidv4 } from "uuid"; import { WebScraperOptions } from "../types"; import * as Sentry from "@sentry/node"; -import { cleanOldConcurrencyLimitEntries, getConcurrencyLimitActiveJobs, getConcurrencyLimitMax, pushConcurrencyLimitActiveJob, pushConcurrencyLimitedJob } from "../lib/concurrency-limit"; +import { + cleanOldConcurrencyLimitEntries, + getConcurrencyLimitActiveJobs, + getConcurrencyLimitMax, + pushConcurrencyLimitActiveJob, + pushConcurrencyLimitedJob +} from "../lib/concurrency-limit"; async function addScrapeJobRaw( webScraperOptions: any, @@ -13,11 +19,17 @@ async function addScrapeJobRaw( ) { let concurrencyLimited = false; - if (webScraperOptions && webScraperOptions.team_id && webScraperOptions.plan) { + if ( + webScraperOptions && + webScraperOptions.team_id && + webScraperOptions.plan + ) { const now = Date.now(); const limit = await getConcurrencyLimitMax(webScraperOptions.plan); cleanOldConcurrencyLimitEntries(webScraperOptions.team_id, now); - concurrencyLimited = (await getConcurrencyLimitActiveJobs(webScraperOptions.team_id, now)).length >= limit; + concurrencyLimited = + (await getConcurrencyLimitActiveJobs(webScraperOptions.team_id, now)) + .length >= limit; } if (concurrencyLimited) { @@ -27,19 +39,23 @@ async function addScrapeJobRaw( opts: { ...options, priority: jobPriority, - jobId: jobId, + jobId: jobId }, - priority: jobPriority, + priority: jobPriority }); } else { - if (webScraperOptions && webScraperOptions.team_id && webScraperOptions.plan) { + if ( + webScraperOptions && + webScraperOptions.team_id && + webScraperOptions.plan + ) { await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId); } await getScrapeQueue().add(jobId, webScraperOptions, { ...options, priority: jobPriority, - jobId, + jobId }); } } @@ -52,24 +68,32 @@ export async function addScrapeJob( ) { if (Sentry.isInitialized()) { const size = JSON.stringify(webScraperOptions).length; - return await Sentry.startSpan({ - name: "Add scrape job", - op: "queue.publish", - attributes: { - "messaging.message.id": jobId, - "messaging.destination.name": getScrapeQueue().name, - "messaging.message.body.size": size, + return await Sentry.startSpan( + { + name: "Add scrape job", + op: "queue.publish", + attributes: { + "messaging.message.id": jobId, + "messaging.destination.name": getScrapeQueue().name, + "messaging.message.body.size": size + } }, - }, async (span) => { - await addScrapeJobRaw({ - ...webScraperOptions, - sentry: { - trace: Sentry.spanToTraceHeader(span), - baggage: Sentry.spanToBaggageHeader(span), - size, - }, - }, options, jobId, jobPriority); - }); + async (span) => { + await addScrapeJobRaw( + { + ...webScraperOptions, + sentry: { + trace: Sentry.spanToTraceHeader(span), + baggage: Sentry.spanToBaggageHeader(span), + size + } + }, + options, + jobId, + jobPriority + ); + } + ); } else { await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority); } @@ -77,18 +101,25 @@ export async function addScrapeJob( export async function addScrapeJobs( jobs: { - data: WebScraperOptions, + data: WebScraperOptions; opts: { - jobId: string, - priority: number, - }, - }[], + jobId: string; + priority: number; + }; + }[] ) { // TODO: better - await Promise.all(jobs.map(job => addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority))); + await Promise.all( + jobs.map((job) => + addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority) + ) + ); } -export function waitForJob(jobId: string, timeout: number): Promise { +export function waitForJob( + jobId: string, + timeout: number +): Promise { return new Promise((resolve, reject) => { const start = Date.now(); const int = setInterval(async () => { @@ -110,5 +141,5 @@ export function waitForJob(jobId: string, timeout: number): Promise } } }, 250); - }) + }); } diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index e6432a3f..3970a6e7 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -5,7 +5,7 @@ import IORedis from "ioredis"; let scrapeQueue: Queue; export const redisConnection = new IORedis(process.env.REDIS_URL!, { - maxRetriesPerRequest: null, + maxRetriesPerRequest: null }); export const scrapeQueueName = "{scrapeQueue}"; @@ -18,12 +18,12 @@ export function getScrapeQueue() { connection: redisConnection, defaultJobOptions: { removeOnComplete: { - age: 90000, // 25 hours + age: 90000 // 25 hours }, removeOnFail: { - age: 90000, // 25 hours - }, - }, + age: 90000 // 25 hours + } + } } // { // settings: { @@ -42,7 +42,6 @@ export function getScrapeQueue() { return scrapeQueue; } - // === REMOVED IN FAVOR OF POLLING -- NOT RELIABLE // import { QueueEvents } from 'bullmq'; -// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() }); \ No newline at end of file +// export const scrapeQueueEvents = new QueueEvents(scrapeQueueName, { connection: redisConnection.duplicate() }); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 74e954cd..dc352d36 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -5,7 +5,7 @@ import { CustomError } from "../lib/custom-error"; import { getScrapeQueue, redisConnection, - scrapeQueueName, + scrapeQueueName } from "./queue-service"; import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; @@ -24,26 +24,31 @@ import { getCrawl, getCrawlJobs, lockURL, - normalizeURL, + normalizeURL } from "../lib/crawl-redis"; import { StoredCrawl } from "../lib/crawl-redis"; import { addScrapeJob } from "./queue-jobs"; import { addJobPriority, deleteJobPriority, - getJobPriority, + getJobPriority } from "../../src/lib/job-priority"; import { PlanType, RateLimiterMode } from "../types"; import { getJobs } from "..//controllers/v1/crawl-status"; import { configDotenv } from "dotenv"; import { scrapeOptions } from "../controllers/v1/types"; import { getRateLimiterPoints } from "./rate-limiter"; -import { cleanOldConcurrencyLimitEntries, pushConcurrencyLimitActiveJob, removeConcurrencyLimitActiveJob, takeConcurrencyLimitedJob } from "../lib/concurrency-limit"; +import { + cleanOldConcurrencyLimitEntries, + pushConcurrencyLimitActiveJob, + removeConcurrencyLimitActiveJob, + takeConcurrencyLimitedJob +} from "../lib/concurrency-limit"; configDotenv(); class RacedRedirectError extends Error { constructor() { - super("Raced redirect error") + super("Raced redirect error"); } } @@ -63,21 +68,28 @@ const connectionMonitorInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 10; const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; -async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { +async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); - const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp); + const jobs = (await getJobs(jobIDs)).sort( + (a, b) => a.timestamp - b.timestamp + ); // const jobStatuses = await Promise.all(jobs.map((x) => x.getState())); - const jobStatus = - sc.cancelled // || jobStatuses.some((x) => x === "failed") - ? "failed" - : "completed"; + const jobStatus = sc.cancelled // || jobStatuses.some((x) => x === "failed") + ? "failed" + : "completed"; - const fullDocs = jobs.map((x) => - x.returnvalue ? (Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue) : null - ).filter(x => x !== null); + const fullDocs = jobs + .map((x) => + x.returnvalue + ? Array.isArray(x.returnvalue) + ? x.returnvalue[0] + : x.returnvalue + : null + ) + .filter((x) => x !== null); await logJob({ job_id: job.data.crawl_id, @@ -91,7 +103,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { url: sc.originUrl!, scrapeOptions: sc.scrapeOptions, crawlerOptions: sc.crawlerOptions, - origin: job.data.origin, + origin: job.data.origin }); const data = { @@ -100,12 +112,12 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { links: fullDocs.map((doc) => { return { content: doc, - source: doc?.metadata?.sourceURL ?? doc?.url ?? "", + source: doc?.metadata?.sourceURL ?? doc?.url ?? "" }; - }), + }) }, project_id: job.data.project_id, - docs: fullDocs, + docs: fullDocs }; // v0 web hooks, call when done with all the data @@ -116,15 +128,14 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { data, job.data.webhook, job.data.v1, - job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed" + job.data.crawlerOptions !== null + ? "crawl.completed" + : "batch_scrape.completed" ); } } else { const jobIDs = await getCrawlJobs(job.data.crawl_id); - const jobStatus = - sc.cancelled - ? "failed" - : "completed"; + const jobStatus = sc.cancelled ? "failed" : "completed"; // v1 web hooks, call when done with no data, but with event completed if (job.data.v1 && job.data.webhook) { @@ -134,30 +145,43 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { [], job.data.webhook, job.data.v1, - job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed" - ); - } + job.data.crawlerOptions !== null + ? "crawl.completed" + : "batch_scrape.completed" + ); + } - await logJob({ - job_id: job.data.crawl_id, - success: jobStatus === "completed", - message: sc.cancelled ? "Cancelled" : undefined, - num_docs: jobIDs.length, - docs: [], - time_taken: (Date.now() - sc.createdAt) / 1000, - team_id: job.data.team_id, - scrapeOptions: sc.scrapeOptions, - mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", - url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"), - crawlerOptions: sc.crawlerOptions, - origin: job.data.origin, - }, true); + await logJob( + { + job_id: job.data.crawl_id, + success: jobStatus === "completed", + message: sc.cancelled ? "Cancelled" : undefined, + num_docs: jobIDs.length, + docs: [], + time_taken: (Date.now() - sc.createdAt) / 1000, + team_id: job.data.team_id, + scrapeOptions: sc.scrapeOptions, + mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", + url: + sc?.originUrl ?? + (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"), + crawlerOptions: sc.crawlerOptions, + origin: job.data.origin + }, + true + ); } } } const processJobInternal = async (token: string, job: Job & { id: string }) => { - const logger = _logger.child({ module: "queue-worker", method: "processJobInternal", jobId: job.id, scrapeId: job.id, crawlId: job.data?.crawl_id ?? undefined }); + const logger = _logger.child({ + module: "queue-worker", + method: "processJobInternal", + jobId: job.id, + scrapeId: job.id, + crawlId: job.data?.crawl_id ?? undefined + }); const extendLockInterval = setInterval(async () => { logger.info(`🐂 Worker extending lock on job ${job.id}`); @@ -171,7 +195,9 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { if (result.success) { try { if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { - logger.debug("Job succeeded -- has crawl associated, putting null in Redis"); + logger.debug( + "Job succeeded -- has crawl associated, putting null in Redis" + ); await job.moveToCompleted(null, token, false); } else { logger.debug("Job succeeded -- putting result in Redis"); @@ -220,7 +246,7 @@ const workerFun = async ( lockDuration: 1 * 60 * 1000, // 1 minute // lockRenewTime: 15 * 1000, // 15 seconds stalledInterval: 30 * 1000, // 30 seconds - maxStalledCount: 10, // 10 times + maxStalledCount: 10 // 10 times }); worker.startStalledCheckTimer(); @@ -241,7 +267,7 @@ const workerFun = async ( if (cantAcceptConnectionCount >= 25) { logger.error("WORKER STALLED", { cpuUsage: await monitor.checkCpuUsage(), - memoryUsage: await monitor.checkMemoryUsage(), + memoryUsage: await monitor.checkMemoryUsage() }); } @@ -265,14 +291,18 @@ const workerFun = async ( if (nextJob !== null) { await pushConcurrencyLimitActiveJob(job.data.team_id, nextJob.id); - await queue.add(nextJob.id, { - ...nextJob.data, - concurrencyLimitHit: true, - }, { - ...nextJob.opts, - jobId: nextJob.id, - priority: nextJob.priority, - }); + await queue.add( + nextJob.id, + { + ...nextJob.data, + concurrencyLimitHit: true + }, + { + ...nextJob.opts, + jobId: nextJob.id, + priority: nextJob.priority + } + ); } } } @@ -281,7 +311,7 @@ const workerFun = async ( Sentry.continueTrace( { sentryTrace: job.data.sentry.trace, - baggage: job.data.sentry.baggage, + baggage: job.data.sentry.baggage }, () => { Sentry.startSpan( @@ -289,8 +319,8 @@ const workerFun = async ( name: "Scrape job", attributes: { job: job.id, - worker: process.env.FLY_MACHINE_ID ?? worker.id, - }, + worker: process.env.FLY_MACHINE_ID ?? worker.id + } }, async (span) => { await Sentry.startSpan( @@ -303,17 +333,17 @@ const workerFun = async ( "messaging.message.body.size": job.data.sentry.size, "messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp), - "messaging.message.retry.count": job.attemptsMade, - }, + "messaging.message.retry.count": job.attemptsMade + } }, async () => { let res; try { res = await processJobInternal(token, job); - } finally { - await afterJobDone(job) + } finally { + await afterJobDone(job); } - + if (res !== null) { span.setStatus({ code: 2 }); // ERROR } else { @@ -331,12 +361,11 @@ const workerFun = async ( name: "Scrape job", attributes: { job: job.id, - worker: process.env.FLY_MACHINE_ID ?? worker.id, - }, + worker: process.env.FLY_MACHINE_ID ?? worker.id + } }, () => { - processJobInternal(token, job) - .finally(() => afterJobDone(job)); + processJobInternal(token, job).finally(() => afterJobDone(job)); } ); } @@ -351,7 +380,13 @@ const workerFun = async ( workerFun(getScrapeQueue(), processJobInternal); async function processJob(job: Job & { id: string }, token: string) { - const logger = _logger.child({ module: "queue-worker", method: "processJob", jobId: job.id, scrapeId: job.id, crawlId: job.data?.crawl_id ?? undefined }); + const logger = _logger.child({ + module: "queue-worker", + method: "processJob", + jobId: job.id, + scrapeId: job.id, + crawlId: job.data?.crawl_id ?? undefined + }); logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url }); // Check if the job URL is researchhub and block it immediately @@ -368,7 +403,7 @@ async function processJob(job: Job & { id: string }, token: string) { document: null, project_id: job.data.project_id, error: - "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", + "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error." }; return data; } @@ -378,21 +413,23 @@ async function processJob(job: Job & { id: string }, token: string) { current: 1, total: 100, current_step: "SCRAPING", - current_url: "", + current_url: "" }); const start = Date.now(); const pipeline = await Promise.race([ startWebScraperPipeline({ job, - token, + token }), - ...(job.data.scrapeOptions.timeout !== undefined ? [ - (async () => { - await sleep(job.data.scrapeOptions.timeout); - throw new Error("timeout") - })(), - ] : []) + ...(job.data.scrapeOptions.timeout !== undefined + ? [ + (async () => { + await sleep(job.data.scrapeOptions.timeout); + throw new Error("timeout"); + })() + ] + : []) ]); if (!pipeline.success) { @@ -410,17 +447,21 @@ async function processJob(job: Job & { id: string }, token: string) { const data = { success: true, result: { - links: [{ - content: doc, - source: doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? "", - }], + links: [ + { + content: doc, + source: doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? "" + } + ] }, project_id: job.data.project_id, - document: doc, + document: doc }; if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { - logger.debug("Calling webhook with success...", { webhook: job.data.webhook }); + logger.debug("Calling webhook with success...", { + webhook: job.data.webhook + }); await callWebhook( job.data.team_id, job.data.crawl_id, @@ -434,54 +475,83 @@ async function processJob(job: Job & { id: string }, token: string) { if (job.data.crawl_id) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; - - if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) { - logger.debug("Was redirected, removing old URL and locking new URL...", { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url }); + + if ( + doc.metadata.url !== undefined && + doc.metadata.sourceURL !== undefined && + normalizeURL(doc.metadata.url, sc) !== + normalizeURL(doc.metadata.sourceURL, sc) + ) { + logger.debug( + "Was redirected, removing old URL and locking new URL...", + { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url } + ); // Remove the old URL from visited unique due to checking for limit // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL) - await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); + await redisConnection.srem( + "crawl:" + job.data.crawl_id + ":visited_unique", + normalizeURL(doc.metadata.sourceURL, sc) + ); const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); - const p2 = generateURLPermutations(normalizeURL(doc.metadata.sourceURL, sc)); + const p2 = generateURLPermutations( + normalizeURL(doc.metadata.sourceURL, sc) + ); // In crawls, we should only crawl a redirected page once, no matter how many; times it is redirected to, or if it's been discovered by the crawler before. // This can prevent flakiness with race conditions. // Lock the new URL const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url); - if (job.data.crawlerOptions !== null && !lockRes && JSON.stringify(p1) !== JSON.stringify(p2)) { + if ( + job.data.crawlerOptions !== null && + !lockRes && + JSON.stringify(p1) !== JSON.stringify(p2) + ) { throw new RacedRedirectError(); } } logger.debug("Logging job to DB..."); - await logJob({ - job_id: job.id as string, - success: true, - num_docs: 1, - docs: [doc], - time_taken: timeTakenInSeconds, - team_id: job.data.team_id, - mode: job.data.mode, - url: job.data.url, - crawlerOptions: sc.crawlerOptions, - scrapeOptions: job.data.scrapeOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, - }, true); + await logJob( + { + job_id: job.id as string, + success: true, + num_docs: 1, + docs: [doc], + time_taken: timeTakenInSeconds, + team_id: job.data.team_id, + mode: job.data.mode, + url: job.data.url, + crawlerOptions: sc.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id + }, + true + ); logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, true); if (job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!); + const crawler = crawlToCrawler( + job.data.crawl_id, + sc, + doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl! + ); const links = crawler.filterLinks( - crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!), + crawler.extractLinksFromHTML( + rawHtml ?? "", + doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl! + ), Infinity, sc.crawlerOptions?.maxDepth ?? 10 ); - logger.debug("Discovered " + links.length + " links...", { linksLength: links.length }); + logger.debug("Discovered " + links.length + " links...", { + linksLength: links.length + }); for (const link of links) { if (await lockURL(job.data.crawl_id, sc, link)) { @@ -489,11 +559,17 @@ async function processJob(job: Job & { id: string }, token: string) { const jobPriority = await getJobPriority({ plan: sc.plan as PlanType, team_id: sc.team_id, - basePriority: job.data.crawl_id ? 20 : 10, + basePriority: job.data.crawl_id ? 20 : 10 }); const jobId = uuidv4(); - logger.debug("Determined job priority " + jobPriority + " for URL " + JSON.stringify(link), { jobPriority, url: link }); + logger.debug( + "Determined job priority " + + jobPriority + + " for URL " + + JSON.stringify(link), + { jobPriority, url: link } + ); // console.log("plan: ", sc.plan); // console.log("team_id: ", sc.team_id) @@ -511,7 +587,7 @@ async function processJob(job: Job & { id: string }, token: string) { origin: job.data.origin, crawl_id: job.data.crawl_id, webhook: job.data.webhook, - v1: job.data.v1, + v1: job.data.v1 }, {}, jobId, @@ -519,9 +595,15 @@ async function processJob(job: Job & { id: string }, token: string) { ); await addCrawlJob(job.data.crawl_id, jobId); - logger.debug("Added job for URL " + JSON.stringify(link), { jobPriority, url: link, newJobId: jobId }); + logger.debug("Added job for URL " + JSON.stringify(link), { + jobPriority, + url: link, + newJobId: jobId + }); } else { - logger.debug("Could not lock URL " + JSON.stringify(link), { url: link }); + logger.debug("Could not lock URL " + JSON.stringify(link), { + url: link + }); } } } @@ -533,7 +615,8 @@ async function processJob(job: Job & { id: string }, token: string) { logger.info(`🐂 Job done ${job.id}`); return data; } catch (error) { - const isEarlyTimeout = error instanceof Error && error.message === "timeout"; + const isEarlyTimeout = + error instanceof Error && error.message === "timeout"; if (isEarlyTimeout) { logger.error(`🐂 Job timed out ${job.id}`); @@ -544,8 +627,8 @@ async function processJob(job: Job & { id: string }, token: string) { Sentry.captureException(error, { data: { - job: job.id, - }, + job: job.id + } }); if (error instanceof CustomError) { @@ -562,7 +645,12 @@ async function processJob(job: Job & { id: string }, token: string) { success: false, document: null, project_id: job.data.project_id, - error: error instanceof Error ? error : typeof error === "string" ? new Error(error) : new Error(JSON.stringify(error)), + error: + error instanceof Error + ? error + : typeof error === "string" + ? new Error(error) + : new Error(JSON.stringify(error)) }; if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) { @@ -572,7 +660,7 @@ async function processJob(job: Job & { id: string }, token: string) { data, job.data.webhook, job.data.v1, - job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page", + job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page" ); } // if (job.data.v1) { @@ -588,31 +676,34 @@ async function processJob(job: Job & { id: string }, token: string) { if (job.data.crawl_id) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; - + logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, false); logger.debug("Logging job to DB..."); - await logJob({ - job_id: job.id as string, - success: false, - message: - typeof error === "string" - ? error - : error.message ?? - "Something went wrong... Contact help@mendable.ai", - num_docs: 0, - docs: [], - time_taken: 0, - team_id: job.data.team_id, - mode: job.data.mode, - url: job.data.url, - crawlerOptions: sc.crawlerOptions, - scrapeOptions: job.data.scrapeOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, - }, true); - + await logJob( + { + job_id: job.id as string, + success: false, + message: + typeof error === "string" + ? error + : (error.message ?? + "Something went wrong... Contact help@mendable.ai"), + num_docs: 0, + docs: [], + time_taken: 0, + team_id: job.data.team_id, + mode: job.data.mode, + url: job.data.url, + crawlerOptions: sc.crawlerOptions, + scrapeOptions: job.data.scrapeOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id + }, + true + ); + await finishCrawlIfNeeded(job, sc); // await logJob({ diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts index 4052bfff..5c25a8d7 100644 --- a/apps/api/src/services/rate-limiter.test.ts +++ b/apps/api/src/services/rate-limiter.test.ts @@ -2,7 +2,7 @@ import { getRateLimiter, serverRateLimiter, testSuiteRateLimiter, - redisRateLimitClient, + redisRateLimitClient } from "./rate-limiter"; import { RateLimiterMode } from "../../src/types"; import { RateLimiterRedis } from "rate-limiter-flexible"; @@ -25,7 +25,7 @@ describe("Rate Limiter Service", () => { afterAll(async () => { try { // if (process.env.REDIS_RATE_LIMIT_URL === "redis://localhost:6379") { - await redisRateLimitClient.disconnect(); + await redisRateLimitClient.disconnect(); // } } catch (error) {} }); @@ -103,7 +103,7 @@ describe("Rate Limiter Service", () => { storeClient: redisRateLimitClient, keyPrefix, points, - duration: 60, + duration: 60 }); expect(limiter.keyPrefix).toBe(keyPrefix); @@ -357,7 +357,7 @@ describe("Rate Limiter Service", () => { storeClient: redisRateLimitClient, keyPrefix, points, - duration, + duration }); const consumePoints = 5; diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 5eecfa70..8067f862 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -18,7 +18,7 @@ const RATE_LIMITS = { etier2c: 300, etier1a: 1000, etier2a: 300, - etierscale1: 150, + etierscale1: 150 }, scrape: { default: 20, @@ -35,7 +35,7 @@ const RATE_LIMITS = { etier2c: 2500, etier1a: 1000, etier2a: 2500, - etierscale1: 1500, + etierscale1: 1500 }, search: { default: 20, @@ -52,9 +52,9 @@ const RATE_LIMITS = { etier2c: 2500, etier1a: 1000, etier2a: 2500, - etierscale1: 1500, + etierscale1: 1500 }, - map:{ + map: { default: 20, free: 5, starter: 50, @@ -69,36 +69,36 @@ const RATE_LIMITS = { etier2c: 2500, etier1a: 1000, etier2a: 2500, - etierscale1: 1500, + etierscale1: 1500 }, preview: { free: 5, - default: 5, + default: 5 }, account: { free: 100, - default: 100, + default: 100 }, crawlStatus: { free: 300, - default: 500, + default: 500 }, testSuite: { free: 10000, - default: 10000, - }, + default: 10000 + } }; export const redisRateLimitClient = new Redis( process.env.REDIS_RATE_LIMIT_URL! -) +); const createRateLimiter = (keyPrefix, points) => new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix, points, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); export const serverRateLimiter = createRateLimiter( @@ -110,43 +110,42 @@ export const testSuiteRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "test-suite", points: 10000, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); export const devBRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "dev-b", points: 1200, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); export const manualRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "manual", points: 2000, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); - export const scrapeStatusRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "scrape-status", points: 400, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); export const etier1aRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "etier1a", points: 10000, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); export const etier2aRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "etier2a", points: 2500, - duration: 60, // Duration in seconds + duration: 60 // Duration in seconds }); const testSuiteTokens = [ @@ -180,12 +179,12 @@ export function getRateLimiterPoints( token?: string, plan?: string, teamId?: string -) : number { +): number { const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} if (!rateLimitConfig) return RATE_LIMITS.account.default; - - const points : number = + + const points: number = rateLimitConfig[makePlanKey(plan)] || rateLimitConfig.default; // 5 return points; } @@ -195,30 +194,33 @@ export function getRateLimiter( token?: string, plan?: string, teamId?: string - ) : RateLimiterRedis { - if (token && testSuiteTokens.some(testToken => token.includes(testToken))) { +): RateLimiterRedis { + if (token && testSuiteTokens.some((testToken) => token.includes(testToken))) { return testSuiteRateLimiter; } - if(teamId && teamId === process.env.DEV_B_TEAM_ID) { + if (teamId && teamId === process.env.DEV_B_TEAM_ID) { return devBRateLimiter; } - - if(teamId && teamId === process.env.ETIER1A_TEAM_ID) { + + if (teamId && teamId === process.env.ETIER1A_TEAM_ID) { return etier1aRateLimiter; } - if(teamId && teamId === process.env.ETIER2A_TEAM_ID) { + if (teamId && teamId === process.env.ETIER2A_TEAM_ID) { return etier2aRateLimiter; } - if(teamId && teamId === process.env.ETIER2D_TEAM_ID) { + if (teamId && teamId === process.env.ETIER2D_TEAM_ID) { return etier2aRateLimiter; } - if(teamId && manual.includes(teamId)) { + if (teamId && manual.includes(teamId)) { return manualRateLimiter; } - - return createRateLimiter(`${mode}-${makePlanKey(plan)}`, getRateLimiterPoints(mode, token, plan, teamId)); + + return createRateLimiter( + `${mode}-${makePlanKey(plan)}`, + getRateLimiterPoints(mode, token, plan, teamId) + ); } diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index 1bd83605..04fcbd5e 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -35,7 +35,12 @@ redisRateLimitClient.on("connect", (err) => { * @param {string} value The value to store. * @param {number} [expire] Optional expiration time in seconds. */ -const setValue = async (key: string, value: string, expire?: number, nx = false) => { +const setValue = async ( + key: string, + value: string, + expire?: number, + nx = false +) => { if (expire && !nx) { await redisRateLimitClient.set(key, value, "EX", expire); } else { diff --git a/apps/api/src/services/redlock.ts b/apps/api/src/services/redlock.ts index 921a973a..757346f9 100644 --- a/apps/api/src/services/redlock.ts +++ b/apps/api/src/services/redlock.ts @@ -21,6 +21,6 @@ export const redlock = new Redlock( // The minimum remaining time on a lock before an extension is automatically // attempted with the `using` API. - automaticExtensionThreshold: 500, // time in ms + automaticExtensionThreshold: 500 // time in ms } ); diff --git a/apps/api/src/services/sentry.ts b/apps/api/src/services/sentry.ts index 072a501e..41f19362 100644 --- a/apps/api/src/services/sentry.ts +++ b/apps/api/src/services/sentry.ts @@ -7,12 +7,10 @@ if (process.env.SENTRY_DSN) { logger.info("Setting up Sentry..."); Sentry.init({ dsn: process.env.SENTRY_DSN, - integrations: [ - nodeProfilingIntegration(), - ], + integrations: [nodeProfilingIntegration()], tracesSampleRate: process.env.SENTRY_ENVIRONMENT === "dev" ? 1.0 : 0.045, profilesSampleRate: 1.0, serverName: process.env.FLY_MACHINE_ID, - environment: process.env.SENTRY_ENVIRONMENT ?? "production", + environment: process.env.SENTRY_ENVIRONMENT ?? "production" }); } diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 61f16836..521a82ca 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -10,7 +10,7 @@ class SupabaseService { constructor() { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; - const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; // Only initialize the Supabase client if both URL and Service Token are provided. if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null @@ -52,6 +52,6 @@ export const supabase_service: SupabaseClient = new Proxy( } // Otherwise, delegate access to the Supabase client. return Reflect.get(client, prop, receiver); - }, + } } ) as unknown as SupabaseClient; diff --git a/apps/api/src/services/system-monitor.ts b/apps/api/src/services/system-monitor.ts index b5e1bf29..4fa4c478 100644 --- a/apps/api/src/services/system-monitor.ts +++ b/apps/api/src/services/system-monitor.ts @@ -1,223 +1,228 @@ -import si from 'systeminformation'; +import si from "systeminformation"; import { Mutex } from "async-mutex"; -import os from 'os'; -import fs from 'fs'; -import { logger } from '../lib/logger'; +import os from "os"; +import fs from "fs"; +import { logger } from "../lib/logger"; const IS_KUBERNETES = process.env.IS_KUBERNETES === "true"; const MAX_CPU = process.env.MAX_CPU ? parseFloat(process.env.MAX_CPU) : 0.8; const MAX_RAM = process.env.MAX_RAM ? parseFloat(process.env.MAX_RAM) : 0.8; -const CACHE_DURATION = process.env.SYS_INFO_MAX_CACHE_DURATION ? parseFloat(process.env.SYS_INFO_MAX_CACHE_DURATION) : 150; - +const CACHE_DURATION = process.env.SYS_INFO_MAX_CACHE_DURATION + ? parseFloat(process.env.SYS_INFO_MAX_CACHE_DURATION) + : 150; class SystemMonitor { - private static instance: SystemMonitor; - private static instanceMutex = new Mutex(); + private static instance: SystemMonitor; + private static instanceMutex = new Mutex(); - private cpuUsageCache: number | null = null; - private memoryUsageCache: number | null = null; - private lastCpuCheck: number = 0; - private lastMemoryCheck: number = 0; - - // Variables for CPU usage calculation - private previousCpuUsage: number = 0; - private previousTime: number = Date.now(); + private cpuUsageCache: number | null = null; + private memoryUsageCache: number | null = null; + private lastCpuCheck: number = 0; + private lastMemoryCheck: number = 0; - private constructor() {} + // Variables for CPU usage calculation + private previousCpuUsage: number = 0; + private previousTime: number = Date.now(); - public static async getInstance(): Promise { - if (SystemMonitor.instance) { - return SystemMonitor.instance; - } - - await this.instanceMutex.runExclusive(async () => { - if (!SystemMonitor.instance) { - SystemMonitor.instance = new SystemMonitor(); - } - }); - - return SystemMonitor.instance; + private constructor() {} + + public static async getInstance(): Promise { + if (SystemMonitor.instance) { + return SystemMonitor.instance; } - public async checkMemoryUsage() { - if (IS_KUBERNETES) { - return this._checkMemoryUsageKubernetes(); - } - return this._checkMemoryUsage(); + await this.instanceMutex.runExclusive(async () => { + if (!SystemMonitor.instance) { + SystemMonitor.instance = new SystemMonitor(); + } + }); + + return SystemMonitor.instance; + } + + public async checkMemoryUsage() { + if (IS_KUBERNETES) { + return this._checkMemoryUsageKubernetes(); + } + return this._checkMemoryUsage(); + } + + private readMemoryCurrent(): number { + const data = fs.readFileSync("/sys/fs/cgroup/memory.current", "utf8"); + return parseInt(data.trim(), 10); + } + + private readMemoryMax(): number { + const data = fs.readFileSync("/sys/fs/cgroup/memory.max", "utf8").trim(); + if (data === "max") { + return Infinity; + } + return parseInt(data, 10); + } + private async _checkMemoryUsageKubernetes() { + try { + const currentMemoryUsage = this.readMemoryCurrent(); + const memoryLimit = this.readMemoryMax(); + + let memoryUsagePercentage: number; + + if (memoryLimit === Infinity) { + // No memory limit set; use total system memory + const totalMemory = os.totalmem(); + memoryUsagePercentage = currentMemoryUsage / totalMemory; + } else { + memoryUsagePercentage = currentMemoryUsage / memoryLimit; + } + + // console.log("Memory usage:", memoryUsagePercentage); + + return memoryUsagePercentage; + } catch (error) { + logger.error(`Error calculating memory usage: ${error}`); + return 0; // Fallback to 0% usage + } + } + + private async _checkMemoryUsage() { + const now = Date.now(); + if ( + this.memoryUsageCache !== null && + now - this.lastMemoryCheck < CACHE_DURATION + ) { + return this.memoryUsageCache; } + const memoryData = await si.mem(); + const totalMemory = memoryData.total; + const availableMemory = memoryData.available; + const usedMemory = totalMemory - availableMemory; + const usedMemoryPercentage = usedMemory / totalMemory; - private readMemoryCurrent(): number { - const data = fs.readFileSync('/sys/fs/cgroup/memory.current', 'utf8'); - return parseInt(data.trim(), 10); + this.memoryUsageCache = usedMemoryPercentage; + this.lastMemoryCheck = now; + + return usedMemoryPercentage; + } + + public async checkCpuUsage() { + if (IS_KUBERNETES) { + return this._checkCpuUsageKubernetes(); + } + return this._checkCpuUsage(); + } + private readCpuUsage(): number { + const data = fs.readFileSync("/sys/fs/cgroup/cpu.stat", "utf8"); + const match = data.match(/^usage_usec (\d+)$/m); + if (match) { + return parseInt(match[1], 10); + } + throw new Error("Could not read usage_usec from cpu.stat"); + } + + private getNumberOfCPUs(): number { + let cpus: number[] = []; + try { + const cpusetPath = "/sys/fs/cgroup/cpuset.cpus.effective"; + const data = fs.readFileSync(cpusetPath, "utf8").trim(); + + if (!data) { + throw new Error(`${cpusetPath} is empty.`); + } + + cpus = this.parseCpuList(data); + + if (cpus.length === 0) { + throw new Error("No CPUs found in cpuset.cpus.effective"); + } + } catch (error) { + logger.warn( + `Unable to read cpuset.cpus.effective, defaulting to OS CPUs: ${error}` + ); + cpus = os.cpus().map((cpu, index) => index); + } + return cpus.length; + } + + private parseCpuList(cpuList: string): number[] { + const ranges = cpuList.split(","); + const cpus: number[] = []; + ranges.forEach((range) => { + const [startStr, endStr] = range.split("-"); + const start = parseInt(startStr, 10); + const end = endStr !== undefined ? parseInt(endStr, 10) : start; + for (let i = start; i <= end; i++) { + cpus.push(i); + } + }); + return cpus; + } + private async _checkCpuUsageKubernetes() { + try { + const usage = this.readCpuUsage(); // In microseconds (µs) + const now = Date.now(); + + // Check if it's the first run + if (this.previousCpuUsage === 0) { + // Initialize previous values + this.previousCpuUsage = usage; + this.previousTime = now; + // Return 0% CPU usage on first run + return 0; + } + + const deltaUsage = usage - this.previousCpuUsage; // In µs + const deltaTime = (now - this.previousTime) * 1000; // Convert ms to µs + + const numCPUs = this.getNumberOfCPUs(); // Get the number of CPUs + + // Calculate the CPU usage percentage and normalize by the number of CPUs + const cpuUsagePercentage = deltaUsage / deltaTime / numCPUs; + + // Update previous values + this.previousCpuUsage = usage; + this.previousTime = now; + + // console.log("CPU usage:", cpuUsagePercentage); + + return cpuUsagePercentage; + } catch (error) { + logger.error(`Error calculating CPU usage: ${error}`); + return 0; // Fallback to 0% usage + } + } + + private async _checkCpuUsage() { + const now = Date.now(); + if ( + this.cpuUsageCache !== null && + now - this.lastCpuCheck < CACHE_DURATION + ) { + return this.cpuUsageCache; } - private readMemoryMax(): number { - const data = fs.readFileSync('/sys/fs/cgroup/memory.max', 'utf8').trim(); - if (data === 'max') { - return Infinity; - } - return parseInt(data, 10); - } - private async _checkMemoryUsageKubernetes() { - try { - const currentMemoryUsage = this.readMemoryCurrent(); - const memoryLimit = this.readMemoryMax(); + const cpuData = await si.currentLoad(); + const cpuLoad = cpuData.currentLoad / 100; - let memoryUsagePercentage: number; + this.cpuUsageCache = cpuLoad; + this.lastCpuCheck = now; - if (memoryLimit === Infinity) { - // No memory limit set; use total system memory - const totalMemory = os.totalmem(); - memoryUsagePercentage = currentMemoryUsage / totalMemory; - } else { - memoryUsagePercentage = currentMemoryUsage / memoryLimit; - } + return cpuLoad; + } - // console.log("Memory usage:", memoryUsagePercentage); + public async acceptConnection() { + const cpuUsage = await this.checkCpuUsage(); + const memoryUsage = await this.checkMemoryUsage(); - return memoryUsagePercentage; - } catch (error) { - logger.error(`Error calculating memory usage: ${error}`); - return 0; // Fallback to 0% usage - } - } + return cpuUsage < MAX_CPU && memoryUsage < MAX_RAM; + } - private async _checkMemoryUsage() { - const now = Date.now(); - if (this.memoryUsageCache !== null && (now - this.lastMemoryCheck) < CACHE_DURATION) { - return this.memoryUsageCache; - } - - const memoryData = await si.mem(); - const totalMemory = memoryData.total; - const availableMemory = memoryData.available; - const usedMemory = totalMemory - availableMemory; - const usedMemoryPercentage = (usedMemory / totalMemory); - - this.memoryUsageCache = usedMemoryPercentage; - this.lastMemoryCheck = now; - - return usedMemoryPercentage; - } - - public async checkCpuUsage() { - if (IS_KUBERNETES) { - return this._checkCpuUsageKubernetes(); - } - return this._checkCpuUsage(); - } - private readCpuUsage(): number { - const data = fs.readFileSync('/sys/fs/cgroup/cpu.stat', 'utf8'); - const match = data.match(/^usage_usec (\d+)$/m); - if (match) { - return parseInt(match[1], 10); - } - throw new Error('Could not read usage_usec from cpu.stat'); - } - - - private getNumberOfCPUs(): number { - let cpus: number[] = []; - try { - const cpusetPath = '/sys/fs/cgroup/cpuset.cpus.effective'; - const data = fs.readFileSync(cpusetPath, 'utf8').trim(); - - if (!data) { - throw new Error(`${cpusetPath} is empty.`); - } - - cpus = this.parseCpuList(data); - - if (cpus.length === 0) { - throw new Error('No CPUs found in cpuset.cpus.effective'); - } - } catch (error) { - logger.warn(`Unable to read cpuset.cpus.effective, defaulting to OS CPUs: ${error}`); - cpus = os.cpus().map((cpu, index) => index); - } - return cpus.length; - } - - - private parseCpuList(cpuList: string): number[] { - const ranges = cpuList.split(','); - const cpus: number[] = []; - ranges.forEach((range) => { - const [startStr, endStr] = range.split('-'); - const start = parseInt(startStr, 10); - const end = endStr !== undefined ? parseInt(endStr, 10) : start; - for (let i = start; i <= end; i++) { - cpus.push(i); - } - }); - return cpus; - } - private async _checkCpuUsageKubernetes() { - try { - const usage = this.readCpuUsage(); // In microseconds (µs) - const now = Date.now(); - - // Check if it's the first run - if (this.previousCpuUsage === 0) { - // Initialize previous values - this.previousCpuUsage = usage; - this.previousTime = now; - // Return 0% CPU usage on first run - return 0; - } - - const deltaUsage = usage - this.previousCpuUsage; // In µs - const deltaTime = (now - this.previousTime) * 1000; // Convert ms to µs - - const numCPUs = this.getNumberOfCPUs(); // Get the number of CPUs - - // Calculate the CPU usage percentage and normalize by the number of CPUs - const cpuUsagePercentage = (deltaUsage / deltaTime) / numCPUs; - - // Update previous values - this.previousCpuUsage = usage; - this.previousTime = now; - - // console.log("CPU usage:", cpuUsagePercentage); - - return cpuUsagePercentage; - } catch (error) { - logger.error(`Error calculating CPU usage: ${error}`); - return 0; // Fallback to 0% usage - } - } - - - private async _checkCpuUsage() { - const now = Date.now(); - if (this.cpuUsageCache !== null && (now - this.lastCpuCheck) < CACHE_DURATION) { - return this.cpuUsageCache; - } - - const cpuData = await si.currentLoad(); - const cpuLoad = cpuData.currentLoad / 100; - - this.cpuUsageCache = cpuLoad; - this.lastCpuCheck = now; - - return cpuLoad; - } - - public async acceptConnection() { - const cpuUsage = await this.checkCpuUsage(); - const memoryUsage = await this.checkMemoryUsage(); - - return cpuUsage < MAX_CPU && memoryUsage < MAX_RAM; - } - - public clearCache() { - this.cpuUsageCache = null; - this.memoryUsageCache = null; - this.lastCpuCheck = 0; - this.lastMemoryCheck = 0; - } + public clearCache() { + this.cpuUsageCache = null; + this.memoryUsageCache = null; + this.lastCpuCheck = 0; + this.lastMemoryCheck = 0; + } } -export default SystemMonitor.getInstance(); \ No newline at end of file +export default SystemMonitor.getInstance(); diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 7840484d..dfee11f6 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -22,7 +22,9 @@ export const callWebhook = async ( id ); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; - let webhookUrl = specified ?? (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined); + let webhookUrl = + specified ?? + (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined); // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set // and the USE_DB_AUTHENTICATION environment variable is set to true @@ -46,7 +48,14 @@ export const callWebhook = async ( webhookUrl = webhooksData[0].url; } - logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook }); + logger.debug("Calling webhook...", { + webhookUrl, + teamId, + specified, + v1, + eventType, + awaitWebhook + }); if (!webhookUrl) { return null; @@ -61,14 +70,12 @@ export const callWebhook = async ( ) { for (let i = 0; i < data.result.links.length; i++) { if (v1) { - dataToSend.push( - data.result.links[i].content - ); + dataToSend.push(data.result.links[i].content); } else { dataToSend.push({ content: data.result.links[i].content.content, markdown: data.result.links[i].content.markdown, - metadata: data.result.links[i].content.metadata, + metadata: data.result.links[i].content.metadata }); } } @@ -82,23 +89,23 @@ export const callWebhook = async ( success: !v1 ? data.success : eventType === "crawl.page" - ? data.success - : true, + ? data.success + : true, type: eventType, [v1 ? "id" : "jobId"]: id, data: dataToSend, error: !v1 ? data?.error || undefined : eventType === "crawl.page" - ? data?.error || undefined - : undefined, + ? data?.error || undefined + : undefined }, { headers: { "Content-Type": "application/json", - ...webhookUrl.headers, + ...webhookUrl.headers }, - timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) + timeout: v1 ? 10000 : 30000 // 10 seconds timeout (v1) } ); } catch (error) { @@ -114,22 +121,22 @@ export const callWebhook = async ( success: !v1 ? data.success : eventType === "crawl.page" - ? data.success - : true, + ? data.success + : true, type: eventType, [v1 ? "id" : "jobId"]: id, data: dataToSend, error: !v1 ? data?.error || undefined : eventType === "crawl.page" - ? data?.error || undefined - : undefined, + ? data?.error || undefined + : undefined }, { headers: { "Content-Type": "application/json", - ...webhookUrl.headers, - }, + ...webhookUrl.headers + } } ) .catch((error) => { diff --git a/apps/api/src/strings.ts b/apps/api/src/strings.ts index 8edc57f1..d5672b82 100644 --- a/apps/api/src/strings.ts +++ b/apps/api/src/strings.ts @@ -1,4 +1,4 @@ export const errorNoResults = "No results found, please check the URL or contact us at help@mendable.ai to file a ticket."; -export const clientSideError = "client-side exception has occurred" \ No newline at end of file +export const clientSideError = "client-side exception has occurred"; diff --git a/apps/api/src/supabase_types.ts b/apps/api/src/supabase_types.ts index 00b2efbb..8f9e1b64 100644 --- a/apps/api/src/supabase_types.ts +++ b/apps/api/src/supabase_types.ts @@ -40,7 +40,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; company: { @@ -77,7 +77,7 @@ export interface Database { columns: ["pricing_plan_id"]; referencedRelation: "pricing_plan"; referencedColumns: ["id"]; - }, + } ]; }; constants: { @@ -126,7 +126,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; customers: { @@ -157,7 +157,7 @@ export interface Database { columns: ["user_id"]; referencedRelation: "users"; referencedColumns: ["id"]; - }, + } ]; }; data: { @@ -236,7 +236,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; data_partitioned: { @@ -390,7 +390,7 @@ export interface Database { columns: ["company_id"]; referencedRelation: "company"; referencedColumns: ["company_id"]; - }, + } ]; }; message: { @@ -439,7 +439,7 @@ export interface Database { columns: ["conversation_id"]; referencedRelation: "conversation"; referencedColumns: ["conversation_id"]; - }, + } ]; }; model_configuration: { @@ -479,7 +479,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; monthly_message_counts: { @@ -507,7 +507,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; prices: { @@ -560,7 +560,7 @@ export interface Database { columns: ["product_id"]; referencedRelation: "products"; referencedColumns: ["id"]; - }, + } ]; }; pricing_plan: { @@ -747,7 +747,7 @@ export interface Database { columns: ["user_id"]; referencedRelation: "users"; referencedColumns: ["id"]; - }, + } ]; }; suggested_questions: { @@ -775,7 +775,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; user_notifications: { @@ -821,7 +821,7 @@ export interface Database { columns: ["user_id"]; referencedRelation: "users"; referencedColumns: ["id"]; - }, + } ]; }; users: { @@ -864,7 +864,7 @@ export interface Database { columns: ["id"]; referencedRelation: "users"; referencedColumns: ["id"]; - }, + } ]; }; z_testcomp_92511: { @@ -934,7 +934,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - }, + } ]; }; }; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index bf7d2248..cfae8f23 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -1,5 +1,10 @@ import { z } from "zod"; -import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, webhookSchema } from "./controllers/v1/types"; +import { + AuthCreditUsageChunk, + ScrapeOptions, + Document as V1Document, + webhookSchema +} from "./controllers/v1/types"; import { ExtractorOptions, Document } from "./lib/entities"; import { InternalOptions } from "./scraper/scrapeURL"; @@ -52,13 +57,15 @@ export interface RunWebScraperParams { is_scrape?: boolean; } -export type RunWebScraperResult = { - success: false; - error: Error; -} | { - success: true; - document: V1Document; -} +export type RunWebScraperResult = + | { + success: false; + error: Error; + } + | { + success: true; + document: V1Document; + }; export interface FirecrawlJob { job_id?: string; @@ -73,8 +80,8 @@ export interface FirecrawlJob { crawlerOptions?: any; scrapeOptions?: any; origin: string; - num_tokens?: number, - retry?: boolean, + num_tokens?: number; + retry?: boolean; crawl_id?: string; } @@ -92,7 +99,6 @@ export interface FirecrawlCrawlResponse { body: { status: string; jobId: string; - }; error?: string; } @@ -101,7 +107,7 @@ export interface FirecrawlCrawlStatusResponse { statusCode: number; body: { status: string; - data: Document[]; + data: Document[]; }; error?: string; } @@ -121,29 +127,29 @@ export enum RateLimiterMode { Scrape = "scrape", Preview = "preview", Search = "search", - Map = "map", - + Map = "map" } -export type AuthResponse = { - success: true; - team_id: string; - api_key?: string; - plan?: PlanType; - chunk: AuthCreditUsageChunk | null; -} | { - success: false; - error: string; - status: number; -} - +export type AuthResponse = + | { + success: true; + team_id: string; + api_key?: string; + plan?: PlanType; + chunk: AuthCreditUsageChunk | null; + } + | { + success: false; + error: string; + status: number; + }; export enum NotificationType { APPROACHING_LIMIT = "approachingLimit", LIMIT_REACHED = "limitReached", RATE_LIMIT_REACHED = "rateLimitReached", AUTO_RECHARGE_SUCCESS = "autoRechargeSuccess", - AUTO_RECHARGE_FAILED = "autoRechargeFailed", + AUTO_RECHARGE_FAILED = "autoRechargeFailed" } export type ScrapeLog = { @@ -161,7 +167,7 @@ export type ScrapeLog = { ipv6_support?: boolean | null; }; -export type PlanType = +export type PlanType = | "starter" | "standard" | "scale" @@ -175,5 +181,11 @@ export type PlanType = | "free" | ""; - -export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed"; \ No newline at end of file +export type WebhookEventType = + | "crawl.page" + | "batch_scrape.page" + | "crawl.started" + | "batch_scrape.started" + | "crawl.completed" + | "batch_scrape.completed" + | "crawl.failed"; From 52f2e733e2a8a25827a9e5d9c64e7a6b652f49ea Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 11 Dec 2024 19:48:22 -0300 Subject: [PATCH 2/3] Nick: fixes --- apps/api/package.json | 3 ++- apps/api/pnpm-lock.yaml | 18 ++++++++++++++---- apps/api/src/controllers/v0/crawl.ts | 18 +++++++----------- apps/api/src/controllers/v0/scrape.ts | 10 ++++------ apps/api/src/index.ts | 14 ++++++-------- apps/api/src/routes/v1.ts | 24 ++++++++++-------------- 6 files changed, 43 insertions(+), 44 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 86f798e9..1f4fd8a8 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -6,7 +6,7 @@ "scripts": { "start": "nodemon --exec ts-node src/index.ts", "start:production": "tsc && node dist/src/index.js", - "format": "npx prettier --write \"src/**/*.(js|ts)\"", + "format": "prettier --write \"src/**/*.(js|ts)\"", "flyio": "node dist/src/index.js", "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc && pnpm sentry:sourcemaps", @@ -102,6 +102,7 @@ "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", + "prettier": "^3.4.2", "promptable": "^0.0.10", "puppeteer": "^22.12.1", "rate-limiter-flexible": "2.4.2", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 4557afa9..563965c1 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -164,6 +164,9 @@ importers: posthog-node: specifier: ^4.0.1 version: 4.0.1 + prettier: + specifier: ^3.4.2 + version: 3.4.2 promptable: specifier: ^0.0.10 version: 0.0.10 @@ -3756,6 +3759,11 @@ packages: resolution: {integrity: sha512-rtqm2h22QxLGBrW2bLYzbRhliIrqgZ0k+gF0LkQ1SNdeD06YE5eilV0MxZppFSxC8TfH0+B0cWCuebEnreIDgQ==} engines: {node: '>=15.0.0'} + prettier@3.4.2: + resolution: {integrity: sha512-e9MewbtFo+Fevyuxn/4rrcDAaq0IYxPGLvObpQjiZBMAzB9IGmzlnG9RZy3FFas+eBMu2vA0CszMeduow5dIuQ==} + engines: {node: '>=14'} + hasBin: true + pretty-format@29.7.0: resolution: {integrity: sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} @@ -4321,8 +4329,8 @@ packages: engines: {node: '>=14.17'} hasBin: true - typescript@5.6.3: - resolution: {integrity: sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==} + typescript@5.7.2: + resolution: {integrity: sha512-i5t66RHxDvVN40HfDd1PsEThGNnlMCMT3jMUuoh9/0TaqWevNontacunWyN02LA9/fIbEWlcHZcgTKb9QoaLfg==} engines: {node: '>=14.17'} hasBin: true @@ -8978,6 +8986,8 @@ snapshots: transitivePeerDependencies: - debug + prettier@3.4.2: {} + pretty-format@29.7.0: dependencies: '@jest/schemas': 29.6.3 @@ -9000,7 +9010,7 @@ snapshots: csv-parse: 5.5.6 gpt3-tokenizer: 1.1.5 openai: 3.3.0 - typescript: 5.6.3 + typescript: 5.7.2 uuid: 9.0.1 zod: 3.23.8 transitivePeerDependencies: @@ -9584,7 +9594,7 @@ snapshots: typescript@5.4.5: {} - typescript@5.6.3: {} + typescript@5.7.2: {} typesense@1.8.2(@babel/runtime@7.24.6): dependencies: diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index b8c6bc63..bb9ba363 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -86,12 +86,10 @@ export async function crawlController(req: Request, res: Response) { } = await checkTeamCredits(chunk, team_id, limitCheck); if (!creditsCheckSuccess) { - return res - .status(402) - .json({ - error: - "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" - }); + return res.status(402).json({ + error: + "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" + }); } // TODO: need to do this to v1 @@ -259,10 +257,8 @@ export async function crawlController(req: Request, res: Response) { } catch (error) { Sentry.captureException(error); logger.error(error); - return res - .status(500) - .json({ - error: error instanceof ZodError ? "Invalid URL" : error.message - }); + return res.status(500).json({ + error: error instanceof ZodError ? "Invalid URL" : error.message + }); } } diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index c7c8d9fe..4a761ea3 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -211,12 +211,10 @@ export async function scrapeController(req: Request, res: Response) { await checkTeamCredits(chunk, team_id, 1); if (!creditsCheckSuccess) { earlyReturn = true; - return res - .status(402) - .json({ - error: - "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" - }); + return res.status(402).json({ + error: + "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" + }); } } catch (error) { logger.error(error); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 905c32d8..a4f4445b 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -235,14 +235,12 @@ app.use( " -- " + verbose ); - res - .status(500) - .json({ - success: false, - error: - "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + - id - }); + res.status(500).json({ + success: false, + error: + "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + + id + }); } ); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 206423ba..a9727e00 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -54,13 +54,11 @@ function checkCreditsMiddleware( `Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}` ); if (!res.headersSent) { - return res - .status(402) - .json({ - success: false, - error: - "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." - }); + return res.status(402).json({ + success: false, + error: + "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." + }); } } req.account = { remainingCredits }; @@ -122,13 +120,11 @@ function idempotencyMiddleware( function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { if (!res.headersSent) { - return res - .status(403) - .json({ - success: false, - error: - "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." - }); + return res.status(403).json({ + success: false, + error: + "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." + }); } } next(); From 8a1c4049188b0899121c5e1bab528c7a2d3b49c1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 11 Dec 2024 19:51:08 -0300 Subject: [PATCH 3/3] Nick: revert trailing comma --- apps/api/.prettierrc | 2 +- .../src/__tests__/e2e_extract/index.test.ts | 76 +-- .../__tests__/e2e_full_withAuth/index.test.ts | 268 +++++----- apps/api/src/__tests__/e2e_map/index.test.ts | 24 +- .../src/__tests__/e2e_noAuth/index.test.ts | 16 +- .../__tests__/e2e_v1_withAuth/index.test.ts | 158 +++--- .../e2e_v1_withAuth_all_params/index.test.ts | 214 ++++---- .../src/__tests__/e2e_withAuth/index.test.ts | 154 +++--- .../src/controllers/__tests__/crawl.test.ts | 14 +- apps/api/src/controllers/auth.ts | 40 +- apps/api/src/controllers/v0/admin/queue.ts | 44 +- .../src/controllers/v0/admin/redis-health.ts | 6 +- apps/api/src/controllers/v0/crawl-cancel.ts | 2 +- apps/api/src/controllers/v0/crawl-status.ts | 10 +- apps/api/src/controllers/v0/crawl.ts | 34 +- apps/api/src/controllers/v0/crawlPreview.ts | 18 +- apps/api/src/controllers/v0/scrape.ts | 50 +- apps/api/src/controllers/v0/search.ts | 32 +- apps/api/src/controllers/v0/status.ts | 8 +- .../v1/__tests__/urlValidation.test.ts | 14 +- apps/api/src/controllers/v1/batch-scrape.ts | 30 +- .../src/controllers/v1/concurrency-check.ts | 6 +- apps/api/src/controllers/v1/crawl-cancel.ts | 4 +- .../api/src/controllers/v1/crawl-status-ws.ts | 31 +- apps/api/src/controllers/v1/crawl-status.ts | 24 +- apps/api/src/controllers/v1/crawl.ts | 46 +- apps/api/src/controllers/v1/extract.ts | 53 +- apps/api/src/controllers/v1/map.ts | 32 +- apps/api/src/controllers/v1/scrape-status.ts | 10 +- apps/api/src/controllers/v1/scrape.ts | 24 +- apps/api/src/controllers/v1/types.ts | 118 ++-- apps/api/src/index.ts | 30 +- apps/api/src/lib/LLM-extraction/index.ts | 10 +- apps/api/src/lib/LLM-extraction/models.ts | 30 +- .../lib/__tests__/html-to-markdown.test.ts | 8 +- .../src/lib/__tests__/job-priority.test.ts | 18 +- apps/api/src/lib/batch-process.ts | 2 +- apps/api/src/lib/cache.ts | 6 +- apps/api/src/lib/concurrency-limit.ts | 18 +- apps/api/src/lib/crawl-redis.test.ts | 8 +- apps/api/src/lib/crawl-redis.ts | 52 +- apps/api/src/lib/custom-error.ts | 2 +- apps/api/src/lib/default-values.ts | 8 +- apps/api/src/lib/extract/reranker.ts | 8 +- apps/api/src/lib/html-to-markdown.ts | 12 +- apps/api/src/lib/job-priority.ts | 6 +- apps/api/src/lib/logger.ts | 18 +- apps/api/src/lib/map-cosine.ts | 4 +- apps/api/src/lib/ranker.test.ts | 6 +- apps/api/src/lib/ranker.ts | 12 +- apps/api/src/lib/scrape-events.ts | 10 +- apps/api/src/lib/validate-country.ts | 502 +++++++++--------- apps/api/src/lib/validateUrl.test.ts | 24 +- apps/api/src/lib/withAuth.ts | 2 +- apps/api/src/main/runWebScraper.ts | 30 +- apps/api/src/routes/admin.ts | 12 +- apps/api/src/routes/v1.ts | 36 +- apps/api/src/run-req.ts | 22 +- .../WebScraper/__tests__/crawler.test.ts | 10 +- apps/api/src/scraper/WebScraper/crawler.ts | 50 +- .../WebScraper/custom/handleCustomScraping.ts | 18 +- apps/api/src/scraper/WebScraper/sitemap.ts | 20 +- .../utils/__tests__/blocklist.test.ts | 16 +- .../src/scraper/WebScraper/utils/blocklist.ts | 8 +- .../scraper/WebScraper/utils/maxDepthUtils.ts | 2 +- .../scraper/scrapeURL/engines/cache/index.ts | 2 +- .../scraper/scrapeURL/engines/docx/index.ts | 2 +- .../scraper/scrapeURL/engines/fetch/index.ts | 12 +- .../engines/fire-engine/checkStatus.ts | 36 +- .../scrapeURL/engines/fire-engine/delete.ts | 12 +- .../scrapeURL/engines/fire-engine/index.ts | 94 ++-- .../scrapeURL/engines/fire-engine/scrape.ts | 18 +- .../src/scraper/scrapeURL/engines/index.ts | 78 +-- .../scraper/scrapeURL/engines/pdf/index.ts | 52 +- .../scrapeURL/engines/playwright/index.ts | 16 +- .../scrapeURL/engines/scrapingbee/index.ts | 24 +- .../scrapeURL/engines/utils/downloadFile.ts | 14 +- .../engines/utils/specialtyHandler.ts | 8 +- apps/api/src/scraper/scrapeURL/error.ts | 7 +- apps/api/src/scraper/scrapeURL/index.ts | 58 +- .../src/scraper/scrapeURL/lib/extractLinks.ts | 2 +- .../scraper/scrapeURL/lib/extractMetadata.ts | 4 +- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 56 +- .../scrapeURL/lib/removeUnwantedElements.ts | 10 +- .../scrapeURL/lib/urlSpecificParams.ts | 6 +- .../src/scraper/scrapeURL/scrapeURL.test.ts | 96 ++-- .../scraper/scrapeURL/transformers/cache.ts | 4 +- .../scraper/scrapeURL/transformers/index.ts | 46 +- .../scrapeURL/transformers/llmExtract.ts | 46 +- .../transformers/removeBase64Images.ts | 2 +- .../transformers/uploadScreenshot.ts | 4 +- apps/api/src/search/fireEngine.ts | 10 +- apps/api/src/search/googlesearch.ts | 18 +- apps/api/src/search/index.ts | 8 +- apps/api/src/search/searchapi.ts | 10 +- apps/api/src/search/serper.ts | 10 +- apps/api/src/services/alerts/index.ts | 10 +- apps/api/src/services/alerts/slack.ts | 8 +- apps/api/src/services/billing/auto_charge.ts | 34 +- .../src/services/billing/credit_billing.ts | 48 +- .../api/src/services/billing/issue_credits.ts | 2 +- apps/api/src/services/billing/stripe.ts | 10 +- apps/api/src/services/logging/crawl_log.ts | 4 +- apps/api/src/services/logging/log_job.ts | 20 +- apps/api/src/services/logging/scrape_log.ts | 6 +- .../notification/email_notification.ts | 34 +- .../notification/notification_string.ts | 2 +- apps/api/src/services/posthog.ts | 4 +- apps/api/src/services/queue-jobs.ts | 32 +- apps/api/src/services/queue-service.ts | 12 +- apps/api/src/services/queue-worker.ts | 132 ++--- apps/api/src/services/rate-limiter.test.ts | 82 +-- apps/api/src/services/rate-limiter.ts | 44 +- apps/api/src/services/redis.ts | 2 +- apps/api/src/services/redlock.ts | 4 +- apps/api/src/services/sentry.ts | 2 +- apps/api/src/services/supabase.ts | 8 +- apps/api/src/services/system-monitor.ts | 2 +- apps/api/src/services/webhook.ts | 32 +- apps/api/src/supabase_types.ts | 30 +- apps/api/src/types.ts | 6 +- 121 files changed, 1965 insertions(+), 1952 deletions(-) diff --git a/apps/api/.prettierrc b/apps/api/.prettierrc index d93a7f24..5d50a9cd 100644 --- a/apps/api/.prettierrc +++ b/apps/api/.prettierrc @@ -1,3 +1,3 @@ { - "trailingComma": "none" + "trailingComma": "all" } \ No newline at end of file diff --git a/apps/api/src/__tests__/e2e_extract/index.test.ts b/apps/api/src/__tests__/e2e_extract/index.test.ts index 117cbab1..e1e4d1ce 100644 --- a/apps/api/src/__tests__/e2e_extract/index.test.ts +++ b/apps/api/src/__tests__/e2e_extract/index.test.ts @@ -3,7 +3,7 @@ import dotenv from "dotenv"; import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, - FirecrawlScrapeResponse + FirecrawlScrapeResponse, } from "../../types"; dotenv.config(); @@ -23,9 +23,9 @@ describe("E2E Tests for Extract API Routes", () => { schema: { type: "object", properties: { - authors: { type: "array", items: { type: "string" } } - } - } + authors: { type: "array", items: { type: "string" } }, + }, + }, }); console.log(response.body); @@ -45,7 +45,7 @@ describe("E2E Tests for Extract API Routes", () => { expect(gotItRight).toBeGreaterThan(1); }, - 60000 + 60000, ); it.concurrent( @@ -62,9 +62,9 @@ describe("E2E Tests for Extract API Routes", () => { schema: { type: "object", properties: { - founders: { type: "array", items: { type: "string" } } - } - } + founders: { type: "array", items: { type: "string" } }, + }, + }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -83,7 +83,7 @@ describe("E2E Tests for Extract API Routes", () => { expect(gotItRight).toBeGreaterThanOrEqual(2); }, - 60000 + 60000, ); it.concurrent( @@ -100,10 +100,10 @@ describe("E2E Tests for Extract API Routes", () => { schema: { type: "array", items: { - type: "string" + type: "string", }, - required: ["items"] - } + required: ["items"], + }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -118,7 +118,7 @@ describe("E2E Tests for Extract API Routes", () => { expect(gotItRight).toBeGreaterThan(2); }, - 60000 + 60000, ); it.concurrent( @@ -135,15 +135,15 @@ describe("E2E Tests for Extract API Routes", () => { schema: { type: "object", properties: { - pciDssCompliance: { type: "boolean" } - } - } + pciDssCompliance: { type: "boolean" }, + }, + }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); expect(response.body.data?.pciDssCompliance).toBe(true); }, - 60000 + 60000, ); it.concurrent( @@ -163,10 +163,10 @@ describe("E2E Tests for Extract API Routes", () => { properties: { connector: { type: "string" }, description: { type: "string" }, - supportsCaptureDelete: { type: "boolean" } - } - } - } + supportsCaptureDelete: { type: "boolean" }, + }, + }, + }, }); console.log(response.body); @@ -174,7 +174,7 @@ describe("E2E Tests for Extract API Routes", () => { // expect(response.body).toHaveProperty("data"); // expect(response.body.data?.pciDssCompliance).toBe(true); }, - 60000 + 60000, ); it.concurrent( @@ -186,17 +186,17 @@ describe("E2E Tests for Extract API Routes", () => { .set("Content-Type", "application/json") .send({ urls: [ - "https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003" + "https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003", ], prompt: "what applicant tracking system is this company using?", schema: { type: "object", properties: { isGreenhouseATS: { type: "boolean" }, - answer: { type: "string" } - } + answer: { type: "string" }, + }, }, - allowExternalLinks: true + allowExternalLinks: true, }); console.log(response.body); @@ -204,7 +204,7 @@ describe("E2E Tests for Extract API Routes", () => { expect(response.body).toHaveProperty("data"); expect(response.body.data?.isGreenhouseATS).toBe(true); }, - 60000 + 60000, ); it.concurrent( @@ -222,12 +222,12 @@ describe("E2E Tests for Extract API Routes", () => { items: { type: "object", properties: { - component: { type: "string" } - } + component: { type: "string" }, + }, }, - required: ["items"] + required: ["items"], }, - allowExternalLinks: true + allowExternalLinks: true, }); console.log(response.body.data?.items); @@ -248,7 +248,7 @@ describe("E2E Tests for Extract API Routes", () => { } expect(gotItRight).toBeGreaterThan(2); }, - 60000 + 60000, ); it.concurrent( @@ -267,11 +267,11 @@ describe("E2E Tests for Extract API Routes", () => { properties: { name: { type: "string" }, work: { type: "string" }, - education: { type: "string" } + education: { type: "string" }, }, - required: ["name", "work", "education"] + required: ["name", "work", "education"], }, - allowExternalLinks: true + allowExternalLinks: true, }); console.log(response.body.data); @@ -281,7 +281,7 @@ describe("E2E Tests for Extract API Routes", () => { expect(response.body.data?.work).toBeDefined(); expect(response.body.data?.education).toBeDefined(); }, - 60000 + 60000, ); it.concurrent( @@ -293,7 +293,7 @@ describe("E2E Tests for Extract API Routes", () => { .set("Content-Type", "application/json") .send({ urls: ["https://docs.firecrawl.dev"], - prompt: "What is the title and description of the page?" + prompt: "What is the title and description of the page?", }); console.log(response.body.data); @@ -302,6 +302,6 @@ describe("E2E Tests for Extract API Routes", () => { expect(typeof response.body.data).toBe("object"); expect(Object.keys(response.body.data).length).toBeGreaterThan(0); }, - 60000 + 60000, ); }); diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index a8841aab..45b3c31e 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -47,7 +47,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent("should return an error for a blocklisted URL", async () => { @@ -59,7 +59,7 @@ describe("E2E Tests for API Routes", () => { .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); @@ -103,30 +103,30 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.title).toBe("Roast My Website"); expect(response.body.data.metadata.description).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(response.body.data.metadata.keywords).toBe( - "Roast My Website,Roast,Website,GitHub,Firecrawl" + "Roast My Website,Roast,Website,GitHub,Firecrawl", ); expect(response.body.data.metadata.robots).toBe("follow, index"); expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); expect(response.body.data.metadata.ogDescription).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(response.body.data.metadata.ogUrl).toBe( - "https://www.roastmywebsite.ai" + "https://www.roastmywebsite.ai", ); expect(response.body.data.metadata.ogImage).toBe( - "https://www.roastmywebsite.ai/og.png" + "https://www.roastmywebsite.ai/og.png", ); expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); expect(response.body.data.metadata.sourceURL).toBe( - "https://roastmywebsite.ai" + "https://roastmywebsite.ai", ); expect(response.body.data.metadata.pageStatusCode).toBe(200); }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( @@ -138,7 +138,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true } + pageOptions: { includeHtml: true }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -152,7 +152,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( @@ -164,7 +164,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://roastmywebsite.ai", - pageOptions: { includeRawHtml: true } + pageOptions: { includeRawHtml: true }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -178,7 +178,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( @@ -196,12 +196,12 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain( - "We present spectrophotometric observations of the Broad Line Radio Galaxy" + "We present spectrophotometric observations of the Broad Line Radio Galaxy", ); expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -219,12 +219,12 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain( - "We present spectrophotometric observations of the Broad Line Radio Galaxy" + "We present spectrophotometric observations of the Broad Line Radio Galaxy", ); expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -236,7 +236,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf", - pageOptions: { parsePDF: false } + pageOptions: { parsePDF: false }, }); await new Promise((r) => setTimeout(r, 6000)); @@ -245,10 +245,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain( - "/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj" + "/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -266,16 +266,16 @@ describe("E2E Tests for API Routes", () => { expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); expect(responseWithoutRemoveTags.body.data.content).toContain( - "Scrape This Site" + "Scrape This Site", ); expect(responseWithoutRemoveTags.body.data.content).toContain( - "Lessons and Videos" + "Lessons and Videos", ); // #footer expect(responseWithoutRemoveTags.body.data.content).toContain( - "[Sandbox](" + "[Sandbox](", ); // .nav expect(responseWithoutRemoveTags.body.data.content).toContain( - "web scraping" + "web scraping", ); // strong const response = await request(TEST_URL) @@ -284,7 +284,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com/", - pageOptions: { removeTags: [".nav", "#footer", "strong"] } + pageOptions: { removeTags: [".nav", "#footer", "strong"] }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -297,7 +297,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav expect(response.body.data.content).not.toContain("web scraping"); // strong }, - 30000 + 30000, ); // 30 seconds timeout // TODO: add this test back once we nail the waitFor option to be more deterministic @@ -337,10 +337,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(400); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "bad request" + "bad request", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -359,10 +359,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(401); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "unauthorized" + "unauthorized", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -381,10 +381,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(403); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "forbidden" + "forbidden", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -403,10 +403,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(404); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "not found" + "not found", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -425,10 +425,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(405); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "method not allowed" + "method not allowed", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -447,10 +447,10 @@ describe("E2E Tests for API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(500); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "internal server error" + "internal server error", ); }, - 60000 + 60000, ); // 60 seconds }); @@ -469,7 +469,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent("should return an error for a blocklisted URL", async () => { @@ -481,7 +481,7 @@ describe("E2E Tests for API Routes", () => { .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); @@ -496,9 +496,9 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("jobId"); expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/, ); - } + }, ); it.concurrent( "should prevent duplicate requests using the same idempotency key", @@ -525,7 +525,7 @@ describe("E2E Tests for API Routes", () => { expect(secondResponse.statusCode).toBe(409); expect(secondResponse.body.error).toBe("Idempotency key already used"); - } + }, ); it.concurrent( @@ -539,8 +539,8 @@ describe("E2E Tests for API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - includes: ["blog/*"] - } + includes: ["blog/*"], + }, }); let response; @@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => { const completedResponse = response; const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { @@ -579,13 +579,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -599,8 +599,8 @@ describe("E2E Tests for API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - excludes: ["blog/*"] - } + excludes: ["blog/*"], + }, }); let isFinished = false; @@ -623,14 +623,14 @@ describe("E2E Tests for API Routes", () => { const completedResponse = response; const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); }); }, - 90000 + 90000, ); // 90 seconds it.concurrent( @@ -642,7 +642,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://mendable.ai", - crawlerOptions: { limit: 3 } + crawlerOptions: { limit: 3 }, }); let isFinished = false; @@ -674,13 +674,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -692,7 +692,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 } + crawlerOptions: { maxDepth: 1 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -726,13 +726,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(1); @@ -748,7 +748,7 @@ describe("E2E Tests for API Routes", () => { expect(depth).toBeLessThanOrEqual(2); }); }, - 180000 + 180000, ); it.concurrent( @@ -760,7 +760,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com/pages/", - crawlerOptions: { maxDepth: 1 } + crawlerOptions: { maxDepth: 1 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -794,7 +794,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(1); @@ -810,7 +810,7 @@ describe("E2E Tests for API Routes", () => { expect(depth).toBeLessThanOrEqual(3); }); }, - 180000 + 180000, ); it.concurrent( @@ -822,7 +822,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.mendable.ai", - crawlerOptions: { maxDepth: 0 } + crawlerOptions: { maxDepth: 0 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -849,7 +849,7 @@ describe("E2E Tests for API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); const testurls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); //console.log(testurls) @@ -861,7 +861,7 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThanOrEqual(1); @@ -877,7 +877,7 @@ describe("E2E Tests for API Routes", () => { expect(depth).toBeLessThanOrEqual(1); }); }, - 180000 + 180000, ); // it.concurrent("should return a successful response with a valid API key and valid limit option", async () => { @@ -934,7 +934,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true } + pageOptions: { includeHtml: true }, }); expect(crawlResponse.statusCode).toBe(200); @@ -969,10 +969,10 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); // 120 seconds @@ -983,13 +983,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].html).toContain(" { allowExternalContentLinks: true, ignoreSitemap: true, returnOnlyUrls: true, - limit: 50 - } + limit: 50, + }, }); expect(crawlInitResponse.statusCode).toBe(200); @@ -1031,19 +1031,19 @@ describe("E2E Tests for API Routes", () => { expect.arrayContaining([ expect.objectContaining({ url: expect.stringContaining( - "https://firecrawl.dev/?ref=mendable+banner" - ) + "https://firecrawl.dev/?ref=mendable+banner", + ), }), expect.objectContaining({ - url: expect.stringContaining("https://mendable.ai/pricing") + url: expect.stringContaining("https://mendable.ai/pricing"), }), expect.objectContaining({ - url: expect.stringContaining("https://x.com/CalebPeffer") - }) - ]) + url: expect.stringContaining("https://x.com/CalebPeffer"), + }), + ]), ); }, - 180000 + 180000, ); // 3 minutes timeout }); @@ -1062,7 +1062,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); // it.concurrent("should return an error for a blocklisted URL", async () => { @@ -1088,7 +1088,7 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(408); }, - 3000 + 3000, ); // it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => { @@ -1120,7 +1120,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ query: "test" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -1136,7 +1136,7 @@ describe("E2E Tests for API Routes", () => { expect(response.body.success).toBe(true); expect(response.body).toHaveProperty("data"); }, - 30000 + 30000, ); // 30 seconds timeout }); @@ -1153,7 +1153,7 @@ describe("E2E Tests for API Routes", () => { .get("/v0/crawl/status/123") .set("Authorization", `Bearer invalid-api-key`); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -1163,7 +1163,7 @@ describe("E2E Tests for API Routes", () => { .get("/v0/crawl/status/invalidJobId") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(404); - } + }, ); it.concurrent( @@ -1201,22 +1201,22 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); const childrenLinks = completedResponse.body.data.filter( (doc) => doc.metadata && doc.metadata.sourceURL && - doc.metadata.sourceURL.includes("mendable.ai/blog") + doc.metadata.sourceURL.includes("mendable.ai/blog"), ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, - 180000 + 180000, ); // 120 seconds it.concurrent( @@ -1236,9 +1236,9 @@ describe("E2E Tests for API Routes", () => { "abs/*", "static/*", "about/*", - "archive/*" - ] - } + "archive/*", + ], + }, }); expect(crawlResponse.statusCode).toBe(200); @@ -1266,21 +1266,21 @@ describe("E2E Tests for API Routes", () => { expect.arrayContaining([ expect.objectContaining({ content: expect.stringContaining( - "asymmetries might represent, for instance, preferred source orientations to our line of sight." - ) - }) - ]) + "asymmetries might represent, for instance, preferred source orientations to our line of sight.", + ), + }), + ]), ); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); }, - 180000 + 180000, ); // 120 seconds it.concurrent( @@ -1292,7 +1292,7 @@ describe("E2E Tests for API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true } + pageOptions: { includeHtml: true }, }); expect(crawlResponse.statusCode).toBe(200); @@ -1333,13 +1333,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.data[0].markdown).toContain("_Roast_"); expect(completedResponse.body.data[0].html).toContain(" { .send({ url: "https://mendable.ai/blog", pageOptions: { includeHtml: true }, - crawlerOptions: { allowBackwardCrawling: true } + crawlerOptions: { allowBackwardCrawling: true }, }); expect(crawlResponse.statusCode).toBe(200); @@ -1397,10 +1397,10 @@ describe("E2E Tests for API Routes", () => { }); expect(completedResponse.body.data.length).toBeGreaterThan( - onlyChildrenLinks.length + onlyChildrenLinks.length, ); }, - 60000 + 60000, ); it.concurrent( @@ -1438,13 +1438,13 @@ describe("E2E Tests for API Routes", () => { expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata"); expect( - completedResponse.body.partial_data[0].metadata.pageStatusCode + completedResponse.body.partial_data[0].metadata.pageStatusCode, ).toBe(200); expect( - completedResponse.body.partial_data[0].metadata.pageError + completedResponse.body.partial_data[0].metadata.pageError, ).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds describe("POST /v0/scrape with LLM Extraction", () => { @@ -1458,7 +1458,7 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://mendable.ai", pageOptions: { - onlyMainContent: true + onlyMainContent: true, }, extractorOptions: { mode: "llm-extraction", @@ -1468,18 +1468,18 @@ describe("E2E Tests for API Routes", () => { type: "object", properties: { company_mission: { - type: "string" + type: "string", }, supports_sso: { - type: "boolean" + type: "boolean", }, is_open_source: { - type: "boolean" - } + type: "boolean", + }, }, - required: ["company_mission", "supports_sso", "is_open_source"] - } - } + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, }); // Ensure that the job was successfully created before proceeding with LLM extraction @@ -1498,7 +1498,7 @@ describe("E2E Tests for API Routes", () => { expect(llmExtraction.is_open_source).toBe(false); expect(typeof llmExtraction.is_open_source).toBe("boolean"); }, - 60000 + 60000, ); // 60 secs it.concurrent( @@ -1519,15 +1519,15 @@ describe("E2E Tests for API Routes", () => { type: "object", properties: { primary_cta: { - type: "string" + type: "string", }, secondary_cta: { - type: "string" - } + type: "string", + }, }, - required: ["primary_cta", "secondary_cta"] - } - } + required: ["primary_cta", "secondary_cta"], + }, + }, }); // Ensure that the job was successfully created before proceeding with LLM extraction @@ -1542,7 +1542,7 @@ describe("E2E Tests for API Routes", () => { expect(llmExtraction).toHaveProperty("secondary_cta"); expect(typeof llmExtraction.secondary_cta).toBe("string"); }, - 60000 + 60000, ); // 60 secs }); @@ -1617,8 +1617,8 @@ describe("E2E Tests for API Routes", () => { .send({ url: "https://flutterbricks.com", crawlerOptions: { - mode: "fast" - } + mode: "fast", + }, }); expect(crawlResponse.statusCode).toBe(200); @@ -1660,7 +1660,7 @@ describe("E2E Tests for API Routes", () => { expect(results.length).toBeGreaterThanOrEqual(10); expect(results.length).toBeLessThanOrEqual(15); }, - 20000 + 20000, ); // it.concurrent("should complete the crawl in more than 10 seconds", async () => { @@ -1741,7 +1741,7 @@ describe("E2E Tests for API Routes", () => { expect(response.statusCode).toBe(429); }, - 90000 + 90000, ); }); diff --git a/apps/api/src/__tests__/e2e_map/index.test.ts b/apps/api/src/__tests__/e2e_map/index.test.ts index 948f097e..30ec6776 100644 --- a/apps/api/src/__tests__/e2e_map/index.test.ts +++ b/apps/api/src/__tests__/e2e_map/index.test.ts @@ -15,7 +15,7 @@ describe("E2E Tests for Map API Routes", () => { .send({ url: "https://firecrawl.dev", sitemapOnly: false, - search: "smart-crawl" + search: "smart-crawl", }); console.log(response.body); @@ -24,7 +24,7 @@ describe("E2E Tests for Map API Routes", () => { expect(response.body.links.length).toBeGreaterThan(0); expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl"); }, - 60000 + 60000, ); it.concurrent( @@ -37,7 +37,7 @@ describe("E2E Tests for Map API Routes", () => { .send({ url: "https://firecrawl.dev", sitemapOnly: false, - includeSubdomains: true + includeSubdomains: true, }); console.log(response.body); @@ -45,10 +45,10 @@ describe("E2E Tests for Map API Routes", () => { expect(response.body).toHaveProperty("links"); expect(response.body.links.length).toBeGreaterThan(0); expect(response.body.links[response.body.links.length - 1]).toContain( - "docs.firecrawl.dev" + "docs.firecrawl.dev", ); }, - 60000 + 60000, ); it.concurrent( @@ -60,7 +60,7 @@ describe("E2E Tests for Map API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev", - sitemapOnly: true + sitemapOnly: true, }); console.log(response.body); @@ -68,10 +68,10 @@ describe("E2E Tests for Map API Routes", () => { expect(response.body).toHaveProperty("links"); expect(response.body.links.length).toBeGreaterThan(0); expect(response.body.links[response.body.links.length - 1]).not.toContain( - "docs.firecrawl.dev" + "docs.firecrawl.dev", ); }, - 60000 + 60000, ); it.concurrent( @@ -84,7 +84,7 @@ describe("E2E Tests for Map API Routes", () => { .send({ url: "https://firecrawl.dev", sitemapOnly: false, - limit: 10 + limit: 10, }); console.log(response.body); @@ -92,7 +92,7 @@ describe("E2E Tests for Map API Routes", () => { expect(response.body).toHaveProperty("links"); expect(response.body.links.length).toBeLessThanOrEqual(10); }, - 60000 + 60000, ); it.concurrent( @@ -104,7 +104,7 @@ describe("E2E Tests for Map API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://geekflare.com/sitemap_index.xml", - sitemapOnly: true + sitemapOnly: true, }); console.log(response.body); @@ -112,6 +112,6 @@ describe("E2E Tests for Map API Routes", () => { expect(response.body).toHaveProperty("links"); expect(response.body.links.length).toBeGreaterThan(1900); }, - 60000 + 60000, ); }); diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index 9c3ddf33..e30352a5 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -62,7 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); @@ -89,7 +89,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); @@ -101,7 +101,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("jobId"); expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/, ); }); }); @@ -120,7 +120,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { .send({ url: blocklistedUrl }); expect(response.statusCode).toBe(403); expect(response.body.error).toContain( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); @@ -132,7 +132,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("jobId"); expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/, ); }); }); @@ -172,7 +172,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { it("should return Job not found for invalid job ID", async () => { const response = await request(TEST_URL).get( - "/v0/crawl/status/invalidJobId" + "/v0/crawl/status/invalidJobId", ); expect(response.statusCode).toBe(404); }); @@ -185,7 +185,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(crawlResponse.statusCode).toBe(200); const response = await request(TEST_URL).get( - `/v0/crawl/status/${crawlResponse.body.jobId}` + `/v0/crawl/status/${crawlResponse.body.jobId}`, ); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); @@ -195,7 +195,7 @@ describe("E2E Tests for API Routes with No Authentication", () => { await new Promise((r) => setTimeout(r, 30000)); const completedResponse = await request(TEST_URL).get( - `/v0/crawl/status/${crawlResponse.body.jobId}` + `/v0/crawl/status/${crawlResponse.body.jobId}`, ); expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 33e3be5d..35ee2d89 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -2,7 +2,7 @@ import request from "supertest"; import { configDotenv } from "dotenv"; import { ScrapeRequestInput, - ScrapeResponseRequestTest + ScrapeResponseRequestTest, } from "../../controllers/v1/types"; configDotenv(); @@ -24,7 +24,7 @@ describe("E2E Tests for v1 API Routes", () => { console.log( "process.env.USE_DB_AUTHENTICATION", - process.env.USE_DB_AUTHENTICATION + process.env.USE_DB_AUTHENTICATION, ); console.log("?", process.env.USE_DB_AUTHENTICATION === "true"); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; @@ -47,7 +47,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent("should throw error for blocklisted URL", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://facebook.com/fake-test" + url: "https://facebook.com/fake-test", }; const response = await request(TEST_URL) @@ -58,7 +58,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(403); expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." + "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", ); }); @@ -71,14 +71,14 @@ describe("E2E Tests for v1 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( "should return a successful response with a valid API key", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://roastmywebsite.ai" + url: "https://roastmywebsite.ai", }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -100,37 +100,37 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.error).toBeUndefined(); expect(response.body.data.metadata.title).toBe("Roast My Website"); expect(response.body.data.metadata.description).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(response.body.data.metadata.keywords).toBe( - "Roast My Website,Roast,Website,GitHub,Firecrawl" + "Roast My Website,Roast,Website,GitHub,Firecrawl", ); expect(response.body.data.metadata.robots).toBe("follow, index"); expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); expect(response.body.data.metadata.ogDescription).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(response.body.data.metadata.ogUrl).toBe( - "https://www.roastmywebsite.ai" + "https://www.roastmywebsite.ai", ); expect(response.body.data.metadata.ogImage).toBe( - "https://www.roastmywebsite.ai/og.png" + "https://www.roastmywebsite.ai/og.png", ); expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); expect(response.body.data.metadata.sourceURL).toBe( - "https://roastmywebsite.ai" + "https://roastmywebsite.ai", ); expect(response.body.data.metadata.statusCode).toBe(200); }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( "should return a successful response with a valid API key", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://arxiv.org/abs/2410.04840" + url: "https://arxiv.org/abs/2410.04840", }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -151,43 +151,43 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.markdown).toContain("Strong Model Collapse"); expect(response.body.data.metadata.error).toBeUndefined(); expect(response.body.data.metadata.description).toContain( - "Abstract page for arXiv paper 2410.04840: Strong Model Collapse" + "Abstract page for arXiv paper 2410.04840: Strong Model Collapse", ); expect(response.body.data.metadata.citation_title).toBe( - "Strong Model Collapse" + "Strong Model Collapse", ); expect(response.body.data.metadata.citation_author).toEqual([ "Dohmatob, Elvis", "Feng, Yunzhen", "Subramonian, Arjun", - "Kempe, Julia" + "Kempe, Julia", ]); expect(response.body.data.metadata.citation_date).toBe("2024/10/07"); expect(response.body.data.metadata.citation_online_date).toBe( - "2024/10/08" + "2024/10/08", ); expect(response.body.data.metadata.citation_pdf_url).toBe( - "http://arxiv.org/pdf/2410.04840" + "http://arxiv.org/pdf/2410.04840", ); expect(response.body.data.metadata.citation_arxiv_id).toBe( - "2410.04840" + "2410.04840", ); expect(response.body.data.metadata.citation_abstract).toContain( - "Within the scaling laws paradigm" + "Within the scaling laws paradigm", ); expect(response.body.data.metadata.sourceURL).toBe( - "https://arxiv.org/abs/2410.04840" + "https://arxiv.org/abs/2410.04840", ); expect(response.body.data.metadata.statusCode).toBe(200); }, - 30000 + 30000, ); it.concurrent( "should return a successful response with a valid API key and includeHtml set to true", async () => { const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", - formats: ["markdown", "html"] + formats: ["markdown", "html"], }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -209,13 +209,13 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); }, - 30000 + 30000, ); it.concurrent( "should return a successful response for a valid scrape with PDF file", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" + url: "https://arxiv.org/pdf/astro-ph/9301001.pdf", // formats: ["markdown", "html"], }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -232,19 +232,19 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.markdown).toContain( - "Broad Line Radio Galaxy" + "Broad Line Radio Galaxy", ); expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); }, - 60000 + 60000, ); it.concurrent( "should return a successful response for a valid scrape with PDF file without explicit .pdf extension", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://arxiv.org/pdf/astro-ph/9301001" + url: "https://arxiv.org/pdf/astro-ph/9301001", }; const response: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/scrape") @@ -261,12 +261,12 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.markdown).toContain( - "Broad Line Radio Galaxy" + "Broad Line Radio Galaxy", ); expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); }, - 60000 + 60000, ); it.concurrent( @@ -274,7 +274,7 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest: ScrapeRequestInput = { url: "https://www.scrapethissite.com/", - onlyMainContent: false // default is true + onlyMainContent: false, // default is true }; const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL) @@ -292,16 +292,16 @@ describe("E2E Tests for v1 API Routes", () => { expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); expect(responseWithoutRemoveTags.body.data.markdown).toContain( - "[FAQ](/faq/)" + "[FAQ](/faq/)", ); // .nav expect(responseWithoutRemoveTags.body.data.markdown).toContain( - "Hartley Brody 2023" + "Hartley Brody 2023", ); // #footer const scrapeRequestWithRemoveTags: ScrapeRequestInput = { url: "https://www.scrapethissite.com/", excludeTags: [".nav", "#footer", "strong"], - onlyMainContent: false // default is true + onlyMainContent: false, // default is true }; const response: ScrapeResponseRequestTest = await request(TEST_URL) .post("/v1/scrape") @@ -320,7 +320,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.markdown).not.toContain("Hartley Brody 2023"); expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); // }, - 30000 + 30000, ); it.concurrent( @@ -342,7 +342,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(400); }, - 60000 + 60000, ); it.concurrent( @@ -364,7 +364,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(401); }, - 60000 + 60000, ); // Removed it as we want to retry fallback to the next scraper @@ -405,7 +405,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(404); }, - 60000 + 60000, ); // it.concurrent('should return a successful response for a scrape with 405 page', async () => { @@ -455,7 +455,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(408); }, - 3000 + 3000, ); it.concurrent( @@ -463,7 +463,7 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", - formats: ["html", "rawHtml"] + formats: ["html", "rawHtml"], }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -486,7 +486,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); }, - 30000 + 30000, ); it.concurrent( @@ -495,7 +495,7 @@ describe("E2E Tests for v1 API Routes", () => { const scrapeRequest: ScrapeRequestInput = { url: "https://ycombinator.com/companies", formats: ["markdown"], - waitFor: 8000 + waitFor: 8000, }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -518,7 +518,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); }, - 30000 + 30000, ); it.concurrent( @@ -526,7 +526,7 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", - formats: ["links"] + formats: ["links"], }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -548,7 +548,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data.metadata.statusCode).toBe(200); expect(response.body.data.metadata.error).toBeUndefined(); }, - 30000 + 30000, ); }); @@ -569,14 +569,14 @@ describe("E2E Tests for v1 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( "should return a successful response with a valid API key", async () => { const mapRequest = { - url: "https://roastmywebsite.ai" + url: "https://roastmywebsite.ai", }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -594,7 +594,7 @@ describe("E2E Tests for v1 API Routes", () => { const links = response.body.links as unknown[]; expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); - } + }, ); it.concurrent( @@ -602,7 +602,7 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const mapRequest = { url: "https://usemotion.com", - search: "pricing" + search: "pricing", }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -621,7 +621,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); expect(links[0]).toContain("usemotion.com/pricing"); - } + }, ); it.concurrent( @@ -630,7 +630,7 @@ describe("E2E Tests for v1 API Routes", () => { const mapRequest = { url: "https://firecrawl.dev", search: "docs", - includeSubdomains: true + includeSubdomains: true, }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -650,10 +650,10 @@ describe("E2E Tests for v1 API Routes", () => { expect(links.length).toBeGreaterThan(0); const containsDocsFirecrawlDev = links.some((link: string) => - link.includes("docs.firecrawl.dev") + link.includes("docs.firecrawl.dev"), ); expect(containsDocsFirecrawlDev).toBe(true); - } + }, ); it.concurrent( @@ -662,7 +662,7 @@ describe("E2E Tests for v1 API Routes", () => { const mapRequest = { url: "https://www.firecrawl.dev", search: "docs", - includeSubdomains: true + includeSubdomains: true, }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -682,11 +682,11 @@ describe("E2E Tests for v1 API Routes", () => { expect(links.length).toBeGreaterThan(0); const containsDocsFirecrawlDev = links.some((link: string) => - link.includes("docs.firecrawl.dev") + link.includes("docs.firecrawl.dev"), ); expect(containsDocsFirecrawlDev).toBe(true); }, - 10000 + 10000, ); it.concurrent( @@ -695,7 +695,7 @@ describe("E2E Tests for v1 API Routes", () => { const mapRequest = { url: "https://www.firecrawl.dev", search: "docs", - includeSubdomains: false + includeSubdomains: false, }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -714,14 +714,14 @@ describe("E2E Tests for v1 API Routes", () => { expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); expect(links[0]).not.toContain("docs.firecrawl.dev"); - } + }, ); it.concurrent("should return an error for invalid URL", async () => { const mapRequest = { url: "invalid-url", includeSubdomains: true, - search: "test" + search: "test", }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -746,7 +746,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent("should throw error for blocklisted URL", async () => { const scrapeRequest: ScrapeRequestInput = { - url: "https://facebook.com/fake-test" + url: "https://facebook.com/fake-test", }; const response = await request(TEST_URL) @@ -757,7 +757,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(403); expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." + "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", ); }); @@ -770,7 +770,7 @@ describe("E2E Tests for v1 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent("should return a successful response", async () => { @@ -783,7 +783,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("id"); expect(response.body.id).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/, ); expect(response.body).toHaveProperty("success", true); expect(response.body).toHaveProperty("url"); @@ -800,7 +800,7 @@ describe("E2E Tests for v1 API Routes", () => { .send({ url: "https://firecrawl.dev", limit: 40, - includePaths: ["blog/*"] + includePaths: ["blog/*"], }); let response; @@ -826,7 +826,7 @@ describe("E2E Tests for v1 API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { @@ -843,7 +843,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); }, - 180000 + 180000, ); // 180 seconds it.concurrent( @@ -856,7 +856,7 @@ describe("E2E Tests for v1 API Routes", () => { .send({ url: "https://firecrawl.dev", limit: 40, - excludePaths: ["blog/*"] + excludePaths: ["blog/*"], }); let isFinished = false; @@ -882,14 +882,14 @@ describe("E2E Tests for v1 API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(3); urls.forEach((url: string) => { expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy(); }); }, - 90000 + 90000, ); // 90 seconds it.concurrent( @@ -901,7 +901,7 @@ describe("E2E Tests for v1 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - maxDepth: 1 + maxDepth: 1, }); expect(crawlResponse.statusCode).toBe(200); @@ -911,7 +911,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("status"); expect(["active", "waiting", "completed", "scraping"]).toContain( - response.body.status + response.body.status, ); // wait for 60 seconds let isCompleted = false; @@ -939,7 +939,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(1); @@ -955,7 +955,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(depth).toBeLessThanOrEqual(2); }); }, - 180000 + 180000, ); }); @@ -972,7 +972,7 @@ describe("E2E Tests for v1 API Routes", () => { .get("/v1/crawl/123") .set("Authorization", `Bearer invalid-api-key`); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -982,7 +982,7 @@ describe("E2E Tests for v1 API Routes", () => { .get("/v1/crawl/invalidJobId") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(404); - } + }, ); it.concurrent( @@ -1026,12 +1026,12 @@ describe("E2E Tests for v1 API Routes", () => { expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); const childrenLinks = completedResponse.body.data.filter( - (doc) => doc.metadata && doc.metadata.sourceURL + (doc) => doc.metadata && doc.metadata.sourceURL, ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, - 180000 + 180000, ); // 120 seconds it.concurrent( @@ -1068,7 +1068,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); expect(completedResponse.body.data[0].metadata.error).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds }); }); diff --git a/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts index e297f7c8..313b7357 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts @@ -2,7 +2,7 @@ import request from "supertest"; import { configDotenv } from "dotenv"; import { ScrapeRequest, - ScrapeResponseRequestTest + ScrapeResponseRequestTest, } from "../../controllers/v1/types"; configDotenv(); @@ -14,7 +14,7 @@ describe("E2E Tests for v1 API Routes", () => { "should return a successful response for a scrape with 403 page", async () => { const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -30,18 +30,18 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.statusCode).toBe(403); }, - 30000 + 30000, ); it.concurrent( "should handle 'formats:markdown (default)' parameter correctly", async () => { const scrapeRequest = { - url: E2E_TEST_SERVER_URL + url: E2E_TEST_SERVER_URL, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -57,26 +57,26 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("markdown"); expect(response.body.data.markdown).toContain( - "This page is used for end-to-end (e2e) testing with Firecrawl." + "This page is used for end-to-end (e2e) testing with Firecrawl.", ); expect(response.body.data.markdown).toContain( - "Content with id #content-1" + "Content with id #content-1", ); // expect(response.body.data.markdown).toContain("Loading..."); expect(response.body.data.markdown).toContain("Click me!"); expect(response.body.data.markdown).toContain( - "Power your AI apps with clean data crawled from any website. It's also open-source." + "Power your AI apps with clean data crawled from any website. It's also open-source.", ); // firecrawl.dev inside an iframe expect(response.body.data.markdown).toContain( - "This content loads only when you see it. Don't blink! 👼" + "This content loads only when you see it. Don't blink! 👼", ); // the browser always scroll to the bottom expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default expect(response.body.data.markdown).not.toContain( - "This content is only visible on mobile" + "This content is only visible on mobile", ); }, - 30000 + 30000, ); it.concurrent( @@ -84,11 +84,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - formats: ["html"] + formats: ["html"], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -105,13 +105,13 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("html"); expect(response.body.data.html).not.toContain( - '
Header
' + '
Header
', ); expect(response.body.data.html).toContain( - '

This page is used for end-to-end (e2e) testing with Firecrawl.

' + '

This page is used for end-to-end (e2e) testing with Firecrawl.

', ); }, - 30000 + 30000, ); it.concurrent( @@ -119,11 +119,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - formats: ["rawHtml"] + formats: ["rawHtml"], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -140,11 +140,11 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.data).toHaveProperty("rawHtml"); expect(response.body.data.rawHtml).toContain( - ">This page is used for end-to-end (e2e) testing with Firecrawl.

" + ">This page is used for end-to-end (e2e) testing with Firecrawl.

", ); expect(response.body.data.rawHtml).toContain(">Header"); }, - 30000 + 30000, ); // - TODO: tests for links @@ -157,11 +157,11 @@ describe("E2E Tests for v1 API Routes", () => { // @ts-ignore const scrapeRequest = { url: E2E_TEST_SERVER_URL, - headers: { "e2e-header-test": "firecrawl" } + headers: { "e2e-header-test": "firecrawl" }, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -175,10 +175,10 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).toContain( - "e2e-header-test: firecrawl" + "e2e-header-test: firecrawl", ); }, - 30000 + 30000, ); it.concurrent( @@ -186,11 +186,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - includeTags: ["#content-1"] + includeTags: ["#content-1"], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -204,13 +204,13 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).not.toContain( - "

This page is used for end-to-end (e2e) testing with Firecrawl.

" + "

This page is used for end-to-end (e2e) testing with Firecrawl.

", ); expect(response.body.data.markdown).toContain( - "Content with id #content-1" + "Content with id #content-1", ); }, - 30000 + 30000, ); it.concurrent( @@ -218,11 +218,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - excludeTags: ["#content-1"] + excludeTags: ["#content-1"], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -236,13 +236,13 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).toContain( - "This page is used for end-to-end (e2e) testing with Firecrawl." + "This page is used for end-to-end (e2e) testing with Firecrawl.", ); expect(response.body.data.markdown).not.toContain( - "Content with id #content-1" + "Content with id #content-1", ); }, - 30000 + 30000, ); it.concurrent( @@ -251,11 +251,11 @@ describe("E2E Tests for v1 API Routes", () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, formats: ["html", "markdown"], - onlyMainContent: false + onlyMainContent: false, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -269,13 +269,13 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).toContain( - "This page is used for end-to-end (e2e) testing with Firecrawl." + "This page is used for end-to-end (e2e) testing with Firecrawl.", ); expect(response.body.data.html).toContain( - '
Header
' + '
Header
', ); }, - 30000 + 30000, ); it.concurrent( @@ -283,11 +283,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - timeout: 500 + timeout: 500, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -302,7 +302,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.body.error).toBe("Request timed out"); expect(response.body.success).toBe(false); }, - 30000 + 30000, ); it.concurrent( @@ -310,11 +310,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - mobile: true + mobile: true, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -327,17 +327,17 @@ describe("E2E Tests for v1 API Routes", () => { throw new Error("Expected response body to have 'data' property"); } expect(response.body.data.markdown).toContain( - "This content is only visible on mobile" + "This content is only visible on mobile", ); }, - 30000 + 30000, ); it.concurrent( "should handle 'parsePDF' parameter correctly", async () => { const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -352,21 +352,21 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).toContain( - "arXiv:astro-ph/9301001v1 7 Jan 1993" + "arXiv:astro-ph/9301001v1 7 Jan 1993", ); expect(response.body.data.markdown).not.toContain( - "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm" + "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm", ); const responseNoParsePDF: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") .send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf", - parsePDF: false + parsePDF: false, }); await new Promise((r) => setTimeout(r, 6000)); @@ -376,10 +376,10 @@ describe("E2E Tests for v1 API Routes", () => { throw new Error("Expected response body to have 'data' property"); } expect(responseNoParsePDF.body.data.markdown).toContain( - "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm" + "h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm", ); }, - 30000 + 30000, ); // it.concurrent("should handle 'location' parameter correctly", @@ -408,11 +408,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: "https://expired.badssl.com/", - timeout: 120000 + timeout: 120000, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -430,7 +430,7 @@ describe("E2E Tests for v1 API Routes", () => { const scrapeRequestWithSkipTlsVerification = { url: "https://expired.badssl.com/", skipTlsVerification: true, - timeout: 120000 + timeout: 120000, } as ScrapeRequest; const responseWithSkipTlsVerification: ScrapeResponseRequestTest = @@ -448,10 +448,10 @@ describe("E2E Tests for v1 API Routes", () => { } // console.log(responseWithSkipTlsVerification.body.data) expect(responseWithSkipTlsVerification.body.data.markdown).toContain( - "badssl.com" + "badssl.com", ); }, - 60000 + 60000, ); it.concurrent( @@ -459,11 +459,11 @@ describe("E2E Tests for v1 API Routes", () => { async () => { const scrapeRequest = { url: E2E_TEST_SERVER_URL, - removeBase64Images: true + removeBase64Images: true, } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -478,7 +478,7 @@ describe("E2E Tests for v1 API Routes", () => { // - TODO: not working for every image // expect(response.body.data.markdown).toContain("Image-Removed"); }, - 30000 + 30000, ); it.concurrent( @@ -489,13 +489,13 @@ describe("E2E Tests for v1 API Routes", () => { actions: [ { type: "wait", - milliseconds: 10000 - } - ] + milliseconds: 10000, + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -508,10 +508,10 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).not.toContain("Loading..."); expect(response.body.data.markdown).toContain( - "Content loaded after 5 seconds!" + "Content loaded after 5 seconds!", ); }, - 30000 + 30000, ); // screenshot @@ -522,13 +522,13 @@ describe("E2E Tests for v1 API Routes", () => { url: E2E_TEST_SERVER_URL, actions: [ { - type: "screenshot" - } - ] + type: "screenshot", + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -543,15 +543,15 @@ describe("E2E Tests for v1 API Routes", () => { throw new Error("Expected response body to have screenshots array"); } expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan( - 0 + 0, ); expect(response.body.data.actions.screenshots[0]).toContain( - "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-" + "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-", ); // TODO compare screenshot with expected screenshot }, - 30000 + 30000, ); it.concurrent( @@ -562,16 +562,16 @@ describe("E2E Tests for v1 API Routes", () => { actions: [ { type: "screenshot", - fullPage: true + fullPage: true, }, { - type: "scrape" - } - ] + type: "scrape", + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -587,24 +587,24 @@ describe("E2E Tests for v1 API Routes", () => { throw new Error("Expected response body to have screenshots array"); } expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan( - 0 + 0, ); expect(response.body.data.actions.screenshots[0]).toContain( - "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-" + "https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-", ); if (!response.body.data.actions?.scrapes) { throw new Error("Expected response body to have scrapes array"); } expect(response.body.data.actions.scrapes[0].url).toBe( - "https://firecrawl-e2e-test.vercel.app/" + "https://firecrawl-e2e-test.vercel.app/", ); expect(response.body.data.actions.scrapes[0].html).toContain( - "This page is used for end-to-end (e2e) testing with Firecrawl.

" + "This page is used for end-to-end (e2e) testing with Firecrawl.

", ); // TODO compare screenshot with expected full page screenshot }, - 30000 + 30000, ); it.concurrent( @@ -615,13 +615,13 @@ describe("E2E Tests for v1 API Routes", () => { actions: [ { type: "click", - selector: "#click-me" - } - ] + selector: "#click-me", + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -634,10 +634,10 @@ describe("E2E Tests for v1 API Routes", () => { } expect(response.body.data.markdown).not.toContain("Click me!"); expect(response.body.data.markdown).toContain( - "Text changed after click!" + "Text changed after click!", ); }, - 30000 + 30000, ); it.concurrent( @@ -649,17 +649,17 @@ describe("E2E Tests for v1 API Routes", () => { actions: [ { type: "click", - selector: "#input-1" + selector: "#input-1", }, { type: "write", - text: "Hello, world!" - } - ] + text: "Hello, world!", + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -675,7 +675,7 @@ describe("E2E Tests for v1 API Routes", () => { // uncomment the following line: // expect(response.body.data.html).toContain(""); }, - 30000 + 30000, ); // TODO: fix this test (need to fix fire-engine first) @@ -688,13 +688,13 @@ describe("E2E Tests for v1 API Routes", () => { actions: [ { type: "press", - key: "ArrowDown" - } - ] + key: "ArrowDown", + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -709,7 +709,7 @@ describe("E2E Tests for v1 API Routes", () => { // } // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown") }, - 30000 + 30000, ); // TODO: fix this test (need to fix fire-engine first) @@ -722,18 +722,18 @@ describe("E2E Tests for v1 API Routes", () => { actions: [ { type: "click", - selector: "#scroll-bottom-loader" + selector: "#scroll-bottom-loader", }, { type: "scroll", direction: "down", - amount: 2000 - } - ] + amount: 2000, + }, + ], } as ScrapeRequest; const response: ScrapeResponseRequestTest = await request( - FIRECRAWL_API_URL + FIRECRAWL_API_URL, ) .post("/v1/scrape") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) @@ -748,7 +748,7 @@ describe("E2E Tests for v1 API Routes", () => { // // expect(response.body.data.markdown).toContain("You have reached the bottom!") }, - 30000 + 30000, ); // TODO: test scrape action diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index e026eef0..46668e64 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -3,7 +3,7 @@ import dotenv from "dotenv"; import { FirecrawlCrawlResponse, FirecrawlCrawlStatusResponse, - FirecrawlScrapeResponse + FirecrawlScrapeResponse, } from "../../types"; dotenv.config(); @@ -42,7 +42,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -63,30 +63,30 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data.metadata.pageError).toBeUndefined(); expect(response.body.data.metadata.title).toBe("Roast My Website"); expect(response.body.data.metadata.description).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(response.body.data.metadata.keywords).toBe( - "Roast My Website,Roast,Website,GitHub,Firecrawl" + "Roast My Website,Roast,Website,GitHub,Firecrawl", ); expect(response.body.data.metadata.robots).toBe("follow, index"); expect(response.body.data.metadata.ogTitle).toBe("Roast My Website"); expect(response.body.data.metadata.ogDescription).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(response.body.data.metadata.ogUrl).toBe( - "https://www.roastmywebsite.ai" + "https://www.roastmywebsite.ai", ); expect(response.body.data.metadata.ogImage).toBe( - "https://www.roastmywebsite.ai/og.png" + "https://www.roastmywebsite.ai/og.png", ); expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]); expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website"); expect(response.body.data.metadata.sourceURL).toBe( - "https://roastmywebsite.ai" + "https://roastmywebsite.ai", ); expect(response.body.data.metadata.pageStatusCode).toBe(200); }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( @@ -98,7 +98,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://roastmywebsite.ai", - pageOptions: { includeHtml: true } + pageOptions: { includeHtml: true }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -112,7 +112,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( @@ -130,12 +130,12 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain( - "We present spectrophotometric observations of the Broad Line Radio Galaxy" + "We present spectrophotometric observations of the Broad Line Radio Galaxy", ); expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -153,12 +153,12 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("content"); expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.content).toContain( - "We present spectrophotometric observations of the Broad Line Radio Galaxy" + "We present spectrophotometric observations of the Broad Line Radio Galaxy", ); expect(response.body.data.metadata.pageStatusCode).toBe(200); expect(response.body.data.metadata.pageError).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -177,16 +177,16 @@ describe("E2E Tests for v0 API Routes", () => { expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata"); expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html"); expect(responseWithoutRemoveTags.body.data.content).toContain( - "Scrape This Site" + "Scrape This Site", ); expect(responseWithoutRemoveTags.body.data.content).toContain( - "Lessons and Videos" + "Lessons and Videos", ); // #footer expect(responseWithoutRemoveTags.body.data.content).toContain( - "[Sandbox](" + "[Sandbox](", ); // .nav expect(responseWithoutRemoveTags.body.data.content).toContain( - "web scraping" + "web scraping", ); // strong const response: FirecrawlScrapeResponse = await request(TEST_URL) @@ -195,7 +195,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com/", - pageOptions: { removeTags: [".nav", "#footer", "strong"] } + pageOptions: { removeTags: [".nav", "#footer", "strong"] }, }); expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("data"); @@ -208,7 +208,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav expect(response.body.data.content).not.toContain("web scraping"); // strong }, - 30000 + 30000, ); // 30 seconds timeout it.concurrent( @@ -227,10 +227,10 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(400); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "bad request" + "bad request", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -249,10 +249,10 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(401); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "unauthorized" + "unauthorized", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -271,10 +271,10 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(403); expect(response.body.data.metadata.pageError.toLowerCase()).toContain( - "forbidden" + "forbidden", ); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -293,7 +293,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(404); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -312,7 +312,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(405); }, - 60000 + 60000, ); // 60 seconds it.concurrent( @@ -331,7 +331,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.data).toHaveProperty("metadata"); expect(response.body.data.metadata.pageStatusCode).toBe(500); }, - 60000 + 60000, ); // 60 seconds }); @@ -351,7 +351,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -365,9 +365,9 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("jobId"); expect(response.body.jobId).toMatch( - /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ + /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/, ); - } + }, ); it.concurrent( @@ -381,8 +381,8 @@ describe("E2E Tests for v0 API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - includes: ["blog/*"] - } + includes: ["blog/*"], + }, }); let response: FirecrawlCrawlStatusResponse; @@ -408,7 +408,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { @@ -424,13 +424,13 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Mendable"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); }, - 180000 + 180000, ); // 180 seconds it.concurrent( @@ -444,8 +444,8 @@ describe("E2E Tests for v0 API Routes", () => { url: "https://mendable.ai", limit: 10, crawlerOptions: { - excludes: ["blog/*"] - } + excludes: ["blog/*"], + }, }); let isFinished = false; @@ -467,20 +467,20 @@ describe("E2E Tests for v0 API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database const completedResponse: FirecrawlCrawlStatusResponse = await request( - TEST_URL + TEST_URL, ) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(5); urls.forEach((url: string) => { expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy(); }); }, - 90000 + 90000, ); // 90 seconds it.concurrent( @@ -492,7 +492,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://www.scrapethissite.com", - crawlerOptions: { maxDepth: 1 } + crawlerOptions: { maxDepth: 1 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -515,7 +515,7 @@ describe("E2E Tests for v0 API Routes", () => { } } const completedResponse: FirecrawlCrawlStatusResponse = await request( - TEST_URL + TEST_URL, ) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); @@ -528,13 +528,13 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); const urls = completedResponse.body.data.map( - (item: any) => item.metadata?.sourceURL + (item: any) => item.metadata?.sourceURL, ); expect(urls.length).toBeGreaterThan(1); @@ -550,14 +550,14 @@ describe("E2E Tests for v0 API Routes", () => { expect(depth).toBeLessThanOrEqual(2); }); }, - 180000 + 180000, ); }); describe("POST /v0/crawlWebsitePreview", () => { it.concurrent("should require authorization", async () => { const response: FirecrawlCrawlResponse = await request(TEST_URL).post( - "/v0/crawlWebsitePreview" + "/v0/crawlWebsitePreview", ); expect(response.statusCode).toBe(401); }); @@ -571,7 +571,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -585,7 +585,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.statusCode).toBe(408); }, - 3000 + 3000, ); }); @@ -604,7 +604,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ query: "test" }); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -620,7 +620,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(response.body.success).toBe(true); expect(response.body).toHaveProperty("data"); }, - 60000 + 60000, ); // 60 seconds timeout }); @@ -637,7 +637,7 @@ describe("E2E Tests for v0 API Routes", () => { .get("/v0/crawl/status/123") .set("Authorization", `Bearer invalid-api-key`); expect(response.statusCode).toBe(401); - } + }, ); it.concurrent( @@ -647,7 +647,7 @@ describe("E2E Tests for v0 API Routes", () => { .get("/v0/crawl/status/invalidJobId") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); expect(response.statusCode).toBe(404); - } + }, ); it.concurrent( @@ -689,22 +689,22 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("metadata"); expect(completedResponse.body.data[0].content).toContain("Firecrawl"); expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 + 200, ); expect( - completedResponse.body.data[0].metadata.pageError + completedResponse.body.data[0].metadata.pageError, ).toBeUndefined(); const childrenLinks = completedResponse.body.data.filter( (doc) => doc.metadata && doc.metadata.sourceURL && - doc.metadata.sourceURL.includes("firecrawl.dev/blog") + doc.metadata.sourceURL.includes("firecrawl.dev/blog"), ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); }, - 180000 + 180000, ); // 120 seconds // TODO: review the test below @@ -762,7 +762,7 @@ describe("E2E Tests for v0 API Routes", () => { .set("Content-Type", "application/json") .send({ url: "https://docs.tatum.io", - crawlerOptions: { limit: 200 } + crawlerOptions: { limit: 200 }, }); expect(crawlResponse.statusCode).toBe(200); @@ -798,22 +798,22 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data).toEqual(expect.arrayContaining([])); expect(completedResponse.body).toHaveProperty("partial_data"); expect(completedResponse.body.partial_data[0]).toHaveProperty( - "content" + "content", ); expect(completedResponse.body.partial_data[0]).toHaveProperty( - "markdown" + "markdown", ); expect(completedResponse.body.partial_data[0]).toHaveProperty( - "metadata" + "metadata", ); expect( - completedResponse.body.partial_data[0].metadata.pageStatusCode + completedResponse.body.partial_data[0].metadata.pageStatusCode, ).toBe(200); expect( - completedResponse.body.partial_data[0].metadata.pageError + completedResponse.body.partial_data[0].metadata.pageError, ).toBeUndefined(); }, - 60000 + 60000, ); // 60 seconds }); @@ -828,7 +828,7 @@ describe("E2E Tests for v0 API Routes", () => { .send({ url: "https://mendable.ai", pageOptions: { - onlyMainContent: true + onlyMainContent: true, }, extractorOptions: { mode: "llm-extraction", @@ -838,18 +838,18 @@ describe("E2E Tests for v0 API Routes", () => { type: "object", properties: { company_mission: { - type: "string" + type: "string", }, supports_sso: { - type: "boolean" + type: "boolean", }, is_open_source: { - type: "boolean" - } + type: "boolean", + }, }, - required: ["company_mission", "supports_sso", "is_open_source"] - } - } + required: ["company_mission", "supports_sso", "is_open_source"], + }, + }, }); // Ensure that the job was successfully created before proceeding with LLM extraction @@ -868,7 +868,7 @@ describe("E2E Tests for v0 API Routes", () => { expect(llmExtraction.is_open_source).toBe(false); expect(typeof llmExtraction.is_open_source).toBe("boolean"); }, - 60000 + 60000, ); // 60 secs }); }); diff --git a/apps/api/src/controllers/__tests__/crawl.test.ts b/apps/api/src/controllers/__tests__/crawl.test.ts index 81fa2e5d..a004ee3c 100644 --- a/apps/api/src/controllers/__tests__/crawl.test.ts +++ b/apps/api/src/controllers/__tests__/crawl.test.ts @@ -10,9 +10,9 @@ jest.mock("../auth", () => ({ success: true, team_id: "team123", error: null, - status: 200 + status: 200, }), - reduce: jest.fn() + reduce: jest.fn(), })); jest.mock("../../services/idempotency/validate"); @@ -21,15 +21,15 @@ describe("crawlController", () => { const req = { headers: { "x-idempotency-key": await uuidv4(), - Authorization: `Bearer ${process.env.TEST_API_KEY}` + Authorization: `Bearer ${process.env.TEST_API_KEY}`, }, body: { - url: "https://mendable.ai" - } + url: "https://mendable.ai", + }, } as unknown as Request; const res = { status: jest.fn().mockReturnThis(), - json: jest.fn() + json: jest.fn(), } as unknown as Response; // Mock the idempotency key validation to return false for the second call @@ -45,7 +45,7 @@ describe("crawlController", () => { await crawlController(req, res); expect(res.status).toHaveBeenCalledWith(409); expect(res.json).toHaveBeenCalledWith({ - error: "Idempotency key already used" + error: "Idempotency key already used", }); }); }); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 947c2784..f865984a 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -4,7 +4,7 @@ import { AuthResponse, NotificationType, PlanType, - RateLimiterMode + RateLimiterMode, } from "../types"; import { supabase_service } from "../services/supabase"; import { withAuth } from "../lib/withAuth"; @@ -41,7 +41,7 @@ export async function setCachedACUC( acuc: | AuthCreditUsageChunk | null - | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null) + | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null), ) { const cacheKeyACUC = `acuc_${api_key}`; const redLockKey = `lock_${cacheKeyACUC}`; @@ -76,7 +76,7 @@ export async function setCachedACUC( export async function getACUC( api_key: string, cacheOnly = false, - useCache = true + useCache = true, ): Promise { const cacheKeyACUC = `acuc_${api_key}`; @@ -97,7 +97,7 @@ export async function getACUC( ({ data, error } = await supabase_service.rpc( "auth_credit_usage_chunk_test_21_credit_pack", { input_key: api_key }, - { get: true } + { get: true }, )); if (!error) { @@ -105,13 +105,13 @@ export async function getACUC( } logger.warn( - `Failed to retrieve authentication and credit usage data after ${retries}, trying again...` + `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`, ); retries++; if (retries === maxRetries) { throw new Error( "Failed to retrieve authentication and credit usage data after 3 attempts: " + - JSON.stringify(error) + JSON.stringify(error), ); } @@ -143,19 +143,19 @@ export async function clearACUC(api_key: string): Promise { export async function authenticateUser( req, res, - mode?: RateLimiterMode + mode?: RateLimiterMode, ): Promise { return withAuth(supaAuthenticateUser, { success: true, chunk: null, - team_id: "bypass" + team_id: "bypass", })(req, res, mode); } export async function supaAuthenticateUser( req, res, - mode?: RateLimiterMode + mode?: RateLimiterMode, ): Promise { const authHeader = req.headers.authorization ?? @@ -170,7 +170,7 @@ export async function supaAuthenticateUser( return { success: false, error: "Unauthorized: Token missing", - status: 401 + status: 401, }; } @@ -199,7 +199,7 @@ export async function supaAuthenticateUser( return { success: false, error: "Unauthorized: Invalid token", - status: 401 + status: 401, }; } @@ -209,7 +209,7 @@ export async function supaAuthenticateUser( return { success: false, error: "Unauthorized: Invalid token", - status: 401 + status: 401, }; } @@ -219,14 +219,14 @@ export async function supaAuthenticateUser( const plan = getPlanByPriceId(priceId); subscriptionData = { team_id: teamId, - plan + plan, }; switch (mode) { case RateLimiterMode.Crawl: rateLimiter = getRateLimiter( RateLimiterMode.Crawl, token, - subscriptionData.plan + subscriptionData.plan, ); break; case RateLimiterMode.Scrape: @@ -234,21 +234,21 @@ export async function supaAuthenticateUser( RateLimiterMode.Scrape, token, subscriptionData.plan, - teamId + teamId, ); break; case RateLimiterMode.Search: rateLimiter = getRateLimiter( RateLimiterMode.Search, token, - subscriptionData.plan + subscriptionData.plan, ); break; case RateLimiterMode.Map: rateLimiter = getRateLimiter( RateLimiterMode.Map, token, - subscriptionData.plan + subscriptionData.plan, ); break; case RateLimiterMode.CrawlStatus: @@ -278,7 +278,7 @@ export async function supaAuthenticateUser( priceId, plan: subscriptionData?.plan, mode, - rateLimiterRes + rateLimiterRes, }); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); @@ -293,7 +293,7 @@ export async function supaAuthenticateUser( return { success: false, error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`, - status: 429 + status: 429, }; } @@ -323,7 +323,7 @@ export async function supaAuthenticateUser( success: true, team_id: teamId ?? undefined, plan: (subscriptionData?.plan ?? "") as PlanType, - chunk + chunk, }; } function getPlanByPriceId(price_id: string | null): PlanType { diff --git a/apps/api/src/controllers/v0/admin/queue.ts b/apps/api/src/controllers/v0/admin/queue.ts index 6cc1c6e0..d7d9c089 100644 --- a/apps/api/src/controllers/v0/admin/queue.ts +++ b/apps/api/src/controllers/v0/admin/queue.ts @@ -8,7 +8,7 @@ import { sendSlackWebhook } from "../../../services/alerts/slack"; export async function cleanBefore24hCompleteJobsController( req: Request, - res: Response + res: Response, ) { logger.info("🐂 Cleaning jobs older than 24h"); try { @@ -22,8 +22,8 @@ export async function cleanBefore24hCompleteJobsController( ["completed"], i * batchSize, i * batchSize + batchSize, - true - ) + true, + ), ); } const completedJobs: Job[] = ( @@ -33,7 +33,7 @@ export async function cleanBefore24hCompleteJobsController( completedJobs.filter( (job) => job.finishedOn !== undefined && - job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + job.finishedOn < Date.now() - 24 * 60 * 60 * 1000, ) || []; let count = 0; @@ -73,14 +73,14 @@ export async function queuesController(req: Request, res: Response) { const scrapeQueue = getScrapeQueue(); const [webScraperActive] = await Promise.all([ - scrapeQueue.getActiveCount() + scrapeQueue.getActiveCount(), ]); const noActiveJobs = webScraperActive === 0; // 200 if no active jobs, 503 if there are active jobs return res.status(noActiveJobs ? 200 : 500).json({ webScraperActive, - noActiveJobs + noActiveJobs, }); } catch (error) { logger.error(error); @@ -99,7 +99,7 @@ export async function autoscalerController(req: Request, res: Response) { await Promise.all([ scrapeQueue.getActiveCount(), scrapeQueue.getWaitingCount(), - scrapeQueue.getPrioritizedCount() + scrapeQueue.getPrioritizedCount(), ]); let waitingAndPriorityCount = webScraperWaiting + webScraperPriority; @@ -109,9 +109,9 @@ export async function autoscalerController(req: Request, res: Response) { "https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines", { headers: { - Authorization: `Bearer ${process.env.FLY_API_TOKEN}` - } - } + Authorization: `Bearer ${process.env.FLY_API_TOKEN}`, + }, + }, ); const machines = await request.json(); @@ -121,7 +121,7 @@ export async function autoscalerController(req: Request, res: Response) { (machine.state === "started" || machine.state === "starting" || machine.state === "replacing") && - machine.config.env["FLY_PROCESS_GROUP"] === "worker" + machine.config.env["FLY_PROCESS_GROUP"] === "worker", ).length; let targetMachineCount = activeMachines; @@ -134,17 +134,17 @@ export async function autoscalerController(req: Request, res: Response) { if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) { targetMachineCount = Math.min( maxNumberOfMachines, - activeMachines + baseScaleUp * 3 + activeMachines + baseScaleUp * 3, ); } else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) { targetMachineCount = Math.min( maxNumberOfMachines, - activeMachines + baseScaleUp * 2 + activeMachines + baseScaleUp * 2, ); } else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) { targetMachineCount = Math.min( maxNumberOfMachines, - activeMachines + baseScaleUp + activeMachines + baseScaleUp, ); } @@ -152,47 +152,47 @@ export async function autoscalerController(req: Request, res: Response) { if (webScraperActive < 100 && waitingAndPriorityCount < 50) { targetMachineCount = Math.max( minNumberOfMachines, - activeMachines - baseScaleDown * 3 + activeMachines - baseScaleDown * 3, ); } else if (webScraperActive < 500 && waitingAndPriorityCount < 200) { targetMachineCount = Math.max( minNumberOfMachines, - activeMachines - baseScaleDown * 2 + activeMachines - baseScaleDown * 2, ); } else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) { targetMachineCount = Math.max( minNumberOfMachines, - activeMachines - baseScaleDown + activeMachines - baseScaleDown, ); } if (targetMachineCount !== activeMachines) { logger.info( - `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting` + `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`, ); if (targetMachineCount > activeMachines) { sendSlackWebhook( `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, - process.env.SLACK_AUTOSCALER ?? "" + process.env.SLACK_AUTOSCALER ?? "", ); } else { sendSlackWebhook( `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, - process.env.SLACK_AUTOSCALER ?? "" + process.env.SLACK_AUTOSCALER ?? "", ); } return res.status(200).json({ mode: "scale-descale", - count: targetMachineCount + count: targetMachineCount, }); } return res.status(200).json({ mode: "normal", - count: activeMachines + count: activeMachines, }); } catch (error) { logger.error(error); diff --git a/apps/api/src/controllers/v0/admin/redis-health.ts b/apps/api/src/controllers/v0/admin/redis-health.ts index 963755ef..b3256edf 100644 --- a/apps/api/src/controllers/v0/admin/redis-health.ts +++ b/apps/api/src/controllers/v0/admin/redis-health.ts @@ -38,7 +38,7 @@ export async function redisHealthController(req: Request, res: Response) { try { await retryOperation(() => redisRateLimitClient.set(testKey, testValue)); redisRateLimitHealth = await retryOperation(() => - redisRateLimitClient.get(testKey) + redisRateLimitClient.get(testKey), ); await retryOperation(() => redisRateLimitClient.del(testKey)); } catch (error) { @@ -49,7 +49,7 @@ export async function redisHealthController(req: Request, res: Response) { const healthStatus = { queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", redisRateLimitClient: - redisRateLimitHealth === testValue ? "healthy" : "unhealthy" + redisRateLimitHealth === testValue ? "healthy" : "unhealthy", }; if ( @@ -60,7 +60,7 @@ export async function redisHealthController(req: Request, res: Response) { return res.status(200).json({ status: "healthy", details: healthStatus }); } else { logger.info( - `Redis instances health check: ${JSON.stringify(healthStatus)}` + `Redis instances health check: ${JSON.stringify(healthStatus)}`, ); // await sendSlackWebhook( // `[REDIS DOWN] Redis instances health check: ${JSON.stringify( diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts index b445978c..db834230 100644 --- a/apps/api/src/controllers/v0/crawl-cancel.ts +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -48,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) { } res.json({ - status: "cancelled" + status: "cancelled", }); } catch (error) { Sentry.captureException(error); diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 756fca44..60ca0e7f 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -60,12 +60,12 @@ export async function crawlStatusController(req: Request, res: Response) { // Combine jobs and jobStatuses into a single array of objects let jobsWithStatuses = jobs.map((job, index) => ({ job, - status: jobStatuses[index] + status: jobStatuses[index], })); // Filter out failed jobs jobsWithStatuses = jobsWithStatuses.filter( - (x) => x.status !== "failed" && x.status !== "unknown" + (x) => x.status !== "failed" && x.status !== "unknown", ); // Sort jobs by timestamp @@ -84,10 +84,10 @@ export async function crawlStatusController(req: Request, res: Response) { const data = jobs .filter( (x) => - x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null + x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null, ) .map((x) => - Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue + Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue, ); if ( @@ -117,7 +117,7 @@ export async function crawlStatusController(req: Request, res: Response) { ? [] : data .filter((x) => x !== null) - .map((x) => toLegacyDocument(x, sc.internalOptions)) + .map((x) => toLegacyDocument(x, sc.internalOptions)), }); } catch (error) { Sentry.captureException(error); diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index bb9ba363..36b8309f 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -10,7 +10,7 @@ import { createIdempotencyKey } from "../../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, - defaultOrigin + defaultOrigin, } from "../../../src/lib/default-values"; import { v4 as uuidv4 } from "uuid"; import { logger } from "../../../src/lib/logger"; @@ -21,7 +21,7 @@ import { lockURL, lockURLs, saveCrawl, - StoredCrawl + StoredCrawl, } from "../../../src/lib/crawl-redis"; import { getScrapeQueue } from "../../../src/services/queue-service"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; @@ -54,7 +54,7 @@ export async function crawlController(req: Request, res: Response) { const crawlerOptions = { ...defaultCrawlerOptions, - ...req.body.crawlerOptions + ...req.body.crawlerOptions, }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; @@ -82,13 +82,13 @@ export async function crawlController(req: Request, res: Response) { const { success: creditsCheckSuccess, message: creditsCheckMessage, - remainingCredits + remainingCredits, } = await checkTeamCredits(chunk, team_id, limitCheck); if (!creditsCheckSuccess) { return res.status(402).json({ error: - "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" + "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com", }); } @@ -113,7 +113,7 @@ export async function crawlController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", }); } @@ -153,7 +153,7 @@ export async function crawlController(req: Request, res: Response) { const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions( pageOptions, undefined, - undefined + undefined, ); internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter @@ -166,7 +166,7 @@ export async function crawlController(req: Request, res: Response) { internalOptions, team_id, plan, - createdAt: Date.now() + createdAt: Date.now(), }; const crawler = crawlToCrawler(id, sc); @@ -204,23 +204,23 @@ export async function crawlController(req: Request, res: Response) { plan, origin: req.body.origin ?? defaultOrigin, crawl_id: id, - sitemapped: true + sitemapped: true, }, opts: { jobId: uuid, - priority: jobPriority - } + priority: jobPriority, + }, }; }); await lockURLs( id, sc, - jobs.map((x) => x.data.url) + jobs.map((x) => x.data.url), ); await addCrawlJobs( id, - jobs.map((x) => x.opts.jobId) + jobs.map((x) => x.opts.jobId), ); for (const job of jobs) { // add with sentry instrumentation @@ -243,12 +243,12 @@ export async function crawlController(req: Request, res: Response) { team_id, plan: plan!, origin: req.body.origin ?? defaultOrigin, - crawl_id: id + crawl_id: id, }, { - priority: 15 // prioritize request 0 of crawl jobs same as scrape jobs + priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs }, - jobId + jobId, ); await addCrawlJob(id, jobId); } @@ -258,7 +258,7 @@ export async function crawlController(req: Request, res: Response) { Sentry.captureException(error); logger.error(error); return res.status(500).json({ - error: error instanceof ZodError ? "Invalid URL" : error.message + error: error instanceof ZodError ? "Invalid URL" : error.message, }); } } diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index 3b47bfaa..405e49c2 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -9,7 +9,7 @@ import { crawlToCrawler, lockURL, saveCrawl, - StoredCrawl + StoredCrawl, } from "../../../src/lib/crawl-redis"; import { addScrapeJob } from "../../../src/services/queue-jobs"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; @@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) { if (isUrlBlocked(url)) { return res.status(403).json({ error: - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", }); } @@ -51,7 +51,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, - removeTags: [] + removeTags: [], }; // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? @@ -94,7 +94,7 @@ export async function crawlPreviewController(req: Request, res: Response) { const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions( pageOptions, undefined, - undefined + undefined, ); const sc: StoredCrawl = { @@ -105,7 +105,7 @@ export async function crawlPreviewController(req: Request, res: Response) { team_id, plan, robots, - createdAt: Date.now() + createdAt: Date.now(), }; await saveCrawl(id, sc); @@ -131,10 +131,10 @@ export async function crawlPreviewController(req: Request, res: Response) { internalOptions, origin: "website-preview", crawl_id: id, - sitemapped: true + sitemapped: true, }, {}, - jobId + jobId, ); await addCrawlJob(id, jobId); } @@ -151,10 +151,10 @@ export async function crawlPreviewController(req: Request, res: Response) { scrapeOptions, internalOptions, origin: "website-preview", - crawl_id: id + crawl_id: id, }, {}, - jobId + jobId, ); await addCrawlJob(id, jobId); } diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 4a761ea3..8501e502 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -2,7 +2,7 @@ import { ExtractorOptions, PageOptions } from "./../../lib/entities"; import { Request, Response } from "express"; import { billTeam, - checkTeamCredits + checkTeamCredits, } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; @@ -11,7 +11,7 @@ import { Document, fromLegacyCombo, toLegacyDocument, - url as urlSchema + url as urlSchema, } from "../v1/types"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; @@ -19,7 +19,7 @@ import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, - defaultOrigin + defaultOrigin, } from "../../lib/default-values"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { getScrapeQueue } from "../../services/queue-service"; @@ -38,7 +38,7 @@ export async function scrapeHelper( pageOptions: PageOptions, extractorOptions: ExtractorOptions, timeout: number, - plan?: PlanType + plan?: PlanType, ): Promise<{ success: boolean; error?: string; @@ -55,7 +55,7 @@ export async function scrapeHelper( success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", - returnCode: 403 + returnCode: 403, }; } @@ -65,7 +65,7 @@ export async function scrapeHelper( pageOptions, extractorOptions, timeout, - crawlerOptions + crawlerOptions, ); await addScrapeJob( @@ -77,11 +77,11 @@ export async function scrapeHelper( internalOptions, plan: plan!, origin: req.body.origin ?? defaultOrigin, - is_scrape: true + is_scrape: true, }, {}, jobId, - jobPriority + jobPriority, ); let doc; @@ -90,7 +90,7 @@ export async function scrapeHelper( { name: "Wait for job to finish", op: "bullmq.wait", - attributes: { job: jobId } + attributes: { job: jobId }, }, async (span) => { try { @@ -104,20 +104,20 @@ export async function scrapeHelper( return { success: false, error: "Request timed out", - returnCode: 408 + returnCode: 408, }; } else if ( typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes( - "LLM extraction did not match the extraction schema you provided." + "LLM extraction did not match the extraction schema you provided.", )) ) { return { success: false, error: e, - returnCode: 500 + returnCode: 500, }; } else { throw e; @@ -125,7 +125,7 @@ export async function scrapeHelper( } span.setAttribute("result", JSON.stringify(doc)); return null; - } + }, ); if (err !== null) { @@ -140,7 +140,7 @@ export async function scrapeHelper( success: true, error: "No page found", returnCode: 200, - data: doc + data: doc, }; } @@ -166,7 +166,7 @@ export async function scrapeHelper( return { success: true, data: toLegacyDocument(doc, internalOptions), - returnCode: 200 + returnCode: 200, }; } @@ -185,7 +185,7 @@ export async function scrapeController(req: Request, res: Response) { const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; const extractorOptions = { ...defaultExtractorOptions, - ...req.body.extractorOptions + ...req.body.extractorOptions, }; const origin = req.body.origin ?? defaultOrigin; let timeout = req.body.timeout ?? defaultTimeout; @@ -197,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) { ) { return res.status(400).json({ error: - "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" + "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified", }); } @@ -213,7 +213,7 @@ export async function scrapeController(req: Request, res: Response) { earlyReturn = true; return res.status(402).json({ error: - "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" + "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing", }); } } catch (error) { @@ -221,7 +221,7 @@ export async function scrapeController(req: Request, res: Response) { earlyReturn = true; return res.status(500).json({ error: - "Error checking team credits. Please contact help@firecrawl.com for help." + "Error checking team credits. Please contact help@firecrawl.com for help.", }); } @@ -236,7 +236,7 @@ export async function scrapeController(req: Request, res: Response) { pageOptions, extractorOptions, timeout, - plan + plan, ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -244,7 +244,7 @@ export async function scrapeController(req: Request, res: Response) { result.data && (result.data as Document).markdown ? numTokensFromString( (result.data as Document).markdown!, - "gpt-3.5-turbo" + "gpt-3.5-turbo", ) : 0; @@ -267,7 +267,7 @@ export async function scrapeController(req: Request, res: Response) { // billing for doc done on queue end, bill only for llm extraction billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => { logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}` + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -290,7 +290,7 @@ export async function scrapeController(req: Request, res: Response) { const { scrapeOptions } = fromLegacyScrapeOptions( pageOptions, extractorOptions, - timeout + timeout, ); logJob({ @@ -306,7 +306,7 @@ export async function scrapeController(req: Request, res: Response) { crawlerOptions: crawlerOptions, scrapeOptions, origin: origin, - num_tokens: numTokens + num_tokens: numTokens, }); return res.status(result.returnCode).json(result); @@ -319,7 +319,7 @@ export async function scrapeController(req: Request, res: Response) { ? "Invalid URL" : typeof error === "string" ? error - : (error?.message ?? "Internal Server Error") + : (error?.message ?? "Internal Server Error"), }); } } diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 4950ea5f..6a3513df 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import { billTeam, - checkTeamCredits + checkTeamCredits, } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; @@ -20,7 +20,7 @@ import { Document, fromLegacyCombo, fromLegacyScrapeOptions, - toLegacyDocument + toLegacyDocument, } from "../v1/types"; export async function searchHelper( @@ -31,7 +31,7 @@ export async function searchHelper( crawlerOptions: any, pageOptions: PageOptions, searchOptions: SearchOptions, - plan: PlanType | undefined + plan: PlanType | undefined, ): Promise<{ success: boolean; error?: string; @@ -62,7 +62,7 @@ export async function searchHelper( filter: filter, lang: searchOptions.lang ?? "en", country: searchOptions.country ?? "us", - location: searchOptions.location + location: searchOptions.location, }); let justSearch = pageOptions.fetchPageContent === false; @@ -71,13 +71,13 @@ export async function searchHelper( pageOptions, undefined, 60000, - crawlerOptions + crawlerOptions, ); if (justSearch) { billTeam(team_id, subscription_id, res.length).catch((error) => { logger.error( - `Failed to bill team ${team_id} for ${res.length} credits: ${error}` + `Failed to bill team ${team_id} for ${res.length} credits: ${error}`, ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -107,12 +107,12 @@ export async function searchHelper( mode: "single_urls", team_id: team_id, scrapeOptions, - internalOptions + internalOptions, }, opts: { jobId: uuid, - priority: jobPriority - } + priority: jobPriority, + }, }; }); @@ -123,7 +123,7 @@ export async function searchHelper( const docs = ( await Promise.all( - jobDatas.map((x) => waitForJob(x.opts.jobId, 60000)) + jobDatas.map((x) => waitForJob(x.opts.jobId, 60000)), ) ).map((x) => toLegacyDocument(x, internalOptions)); @@ -136,7 +136,7 @@ export async function searchHelper( // make sure doc.content is not empty const filteredDocs = docs.filter( - (doc: any) => doc && doc.content && doc.content.trim().length > 0 + (doc: any) => doc && doc.content && doc.content.trim().length > 0, ); if (filteredDocs.length === 0) { @@ -144,14 +144,14 @@ export async function searchHelper( success: true, error: "No page found", returnCode: 200, - data: docs + data: docs, }; } return { success: true, data: filteredDocs, - returnCode: 200 + returnCode: 200, }; } @@ -169,7 +169,7 @@ export async function searchController(req: Request, res: Response) { onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false, fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true, removeTags: req.body.pageOptions?.removeTags ?? [], - fallback: req.body.pageOptions?.fallback ?? false + fallback: req.body.pageOptions?.fallback ?? false, }; const origin = req.body.origin ?? "api"; @@ -197,7 +197,7 @@ export async function searchController(req: Request, res: Response) { crawlerOptions, pageOptions, searchOptions, - plan + plan, ); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -212,7 +212,7 @@ export async function searchController(req: Request, res: Response) { mode: "search", url: req.body.query, crawlerOptions: crawlerOptions, - origin: origin + origin: origin, }); return res.status(result.returnCode).json(result); } catch (error) { diff --git a/apps/api/src/controllers/v0/status.ts b/apps/api/src/controllers/v0/status.ts index 73bfa159..c68579ea 100644 --- a/apps/api/src/controllers/v0/status.ts +++ b/apps/api/src/controllers/v0/status.ts @@ -6,7 +6,7 @@ import * as Sentry from "@sentry/node"; export async function crawlJobStatusPreviewController( req: Request, - res: Response + res: Response, ) { try { const sc = await getCrawl(req.params.jobId); @@ -26,7 +26,7 @@ export async function crawlJobStatusPreviewController( // } const jobs = (await getJobs(req.params.jobId, jobIDs)).sort( - (a, b) => a.timestamp - b.timestamp + (a, b) => a.timestamp - b.timestamp, ); const jobStatuses = await Promise.all(jobs.map((x) => x.getState())); const jobStatus = sc.cancelled @@ -38,7 +38,7 @@ export async function crawlJobStatusPreviewController( : "active"; const data = jobs.map((x) => - Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue + Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue, ); res.json({ @@ -48,7 +48,7 @@ export async function crawlJobStatusPreviewController( total: jobs.length, data: jobStatus === "completed" ? data : null, partial_data: - jobStatus === "completed" ? [] : data.filter((x) => x !== null) + jobStatus === "completed" ? [] : data.filter((x) => x !== null), }); } catch (error) { Sentry.captureException(error); diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts index 1ce058a0..b455e5ab 100644 --- a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts +++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -25,13 +25,13 @@ describe("URL Schema Validation", () => { it("should reject URLs without a valid top-level domain", () => { expect(() => url.parse("http://example")).toThrow( - "URL must have a valid top-level domain or be a valid path" + "URL must have a valid top-level domain or be a valid path", ); }); it("should reject blocked URLs", () => { expect(() => url.parse("https://facebook.com")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); @@ -47,28 +47,28 @@ describe("URL Schema Validation", () => { it("should handle URLs with subdomains that are blocked", () => { expect(() => url.parse("https://sub.facebook.com")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); it("should handle URLs with paths that are blocked", () => { expect(() => url.parse("http://facebook.com/path")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); expect(() => url.parse("https://facebook.com/another/path")).toThrow( - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", ); }); it("should reject malformed URLs starting with 'http://http'", () => { expect(() => url.parse("http://http://example.com")).toThrow( - "Invalid URL. Invalid protocol." + "Invalid URL. Invalid protocol.", ); }); it("should reject malformed URLs containing multiple 'http://'", () => { expect(() => - url.parse("http://example.com/http://example.com") + url.parse("http://example.com/http://example.com"), ).not.toThrow(); }); diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index a78264e3..89fa6741 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -5,14 +5,14 @@ import { batchScrapeRequestSchema, CrawlResponse, RequestWithAuth, - ScrapeOptions + ScrapeOptions, } from "./types"; import { addCrawlJobs, getCrawl, lockURLs, saveCrawl, - StoredCrawl + StoredCrawl, } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getJobPriority } from "../../lib/job-priority"; @@ -22,7 +22,7 @@ import { logger as _logger } from "../../lib/logger"; export async function batchScrapeController( req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>, - res: Response + res: Response, ) { req.body = batchScrapeRequestSchema.parse(req.body); @@ -33,12 +33,12 @@ export async function batchScrapeController( module: "api/v1", method: "batchScrapeController", teamId: req.auth.team_id, - plan: req.auth.plan + plan: req.auth.plan, }); logger.debug("Batch scrape " + id + " starting", { urlsLength: req.body.urls, appendToId: req.body.appendToId, - account: req.account + account: req.account, }); if (!req.body.appendToId) { @@ -59,7 +59,7 @@ export async function batchScrapeController( internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), - plan: req.auth.plan + plan: req.auth.plan, }; if (!req.body.appendToId) { @@ -75,7 +75,7 @@ export async function batchScrapeController( jobPriority = await getJobPriority({ plan: req.auth.plan, team_id: req.auth.team_id, - basePriority: 21 + basePriority: 21, }); } logger.debug("Using job priority " + jobPriority, { jobPriority }); @@ -97,12 +97,12 @@ export async function batchScrapeController( crawl_id: id, sitemapped: true, v1: true, - webhook: req.body.webhook + webhook: req.body.webhook, }, opts: { jobId: uuidv4(), - priority: 20 - } + priority: 20, + }, }; }); @@ -110,19 +110,19 @@ export async function batchScrapeController( await lockURLs( id, sc, - jobs.map((x) => x.data.url) + jobs.map((x) => x.data.url), ); logger.debug("Adding scrape jobs to Redis..."); await addCrawlJobs( id, - jobs.map((x) => x.opts.jobId) + jobs.map((x) => x.opts.jobId), ); logger.debug("Adding scrape jobs to BullMQ..."); await addScrapeJobs(jobs); if (req.body.webhook) { logger.debug("Calling webhook with batch_scrape.started...", { - webhook: req.body.webhook + webhook: req.body.webhook, }); await callWebhook( req.auth.team_id, @@ -130,7 +130,7 @@ export async function batchScrapeController( null, req.body.webhook, true, - "batch_scrape.started" + "batch_scrape.started", ); } @@ -139,6 +139,6 @@ export async function batchScrapeController( return res.status(200).json({ success: true, id, - url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}` + url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`, }); } diff --git a/apps/api/src/controllers/v1/concurrency-check.ts b/apps/api/src/controllers/v1/concurrency-check.ts index bd25c73b..5ed569f5 100644 --- a/apps/api/src/controllers/v1/concurrency-check.ts +++ b/apps/api/src/controllers/v1/concurrency-check.ts @@ -2,7 +2,7 @@ import { authenticateUser } from "../auth"; import { ConcurrencyCheckParams, ConcurrencyCheckResponse, - RequestWithAuth + RequestWithAuth, } from "./types"; import { RateLimiterMode } from "../../types"; import { Response } from "express"; @@ -10,14 +10,14 @@ import { redisConnection } from "../../services/queue-service"; // Basically just middleware and error wrapping export async function concurrencyCheckController( req: RequestWithAuth, - res: Response + res: Response, ) { const concurrencyLimiterKey = "concurrency-limiter:" + req.auth.team_id; const now = Date.now(); const activeJobsOfTeam = await redisConnection.zrangebyscore( concurrencyLimiterKey, now, - Infinity + Infinity, ); return res .status(200) diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts index 986ff104..00af8b31 100644 --- a/apps/api/src/controllers/v1/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -9,7 +9,7 @@ configDotenv(); export async function crawlCancelController( req: RequestWithAuth<{ jobId: string }>, - res: Response + res: Response, ) { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; @@ -43,7 +43,7 @@ export async function crawlCancelController( } res.json({ - status: "cancelled" + status: "cancelled", }); } catch (error) { Sentry.captureException(error); diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index d9994d97..817dc184 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -6,7 +6,7 @@ import { CrawlStatusResponse, Document, ErrorResponse, - RequestWithAuth + RequestWithAuth, } from "./types"; import { WebSocket } from "ws"; import { v4 as uuidv4 } from "uuid"; @@ -19,7 +19,7 @@ import { getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, - isCrawlFinishedLocked + isCrawlFinishedLocked, } from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { getJob, getJobs } from "./crawl-status"; @@ -64,7 +64,7 @@ function close(ws: WebSocket, code: number, msg: Message) { async function crawlStatusWS( ws: WebSocket, - req: RequestWithAuth + req: RequestWithAuth, ) { const sc = await getCrawl(req.params.jobId); if (!sc) { @@ -89,7 +89,10 @@ async function crawlStatusWS( const notDoneJobIDs = jobIDs.filter((x) => !doneJobIDs.includes(x)); const jobStatuses = await Promise.all( - notDoneJobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)]) + notDoneJobIDs.map(async (x) => [ + x, + await getScrapeQueue().getJobState(x), + ]), ); const newlyDoneJobIDs: string[] = jobStatuses .filter((x) => x[1] === "completed" || x[1] === "failed") @@ -102,7 +105,7 @@ async function crawlStatusWS( if (job.returnvalue) { send(ws, { type: "document", - data: job.returnvalue + data: job.returnvalue, }); } else { return close(ws, 3000, { type: "error", error: job.failedReason }); @@ -120,7 +123,9 @@ async function crawlStatusWS( let jobIDs = await getCrawlJobs(req.params.jobId); let jobStatuses = await Promise.all( - jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const) + jobIDs.map( + async (x) => [x, await getScrapeQueue().getJobState(x)] as const, + ), ); const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id))); @@ -161,8 +166,8 @@ async function crawlStatusWS( completed: doneJobIDs.length, creditsUsed: jobIDs.length, expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), - data: data - } + data: data, + }, }); if (status !== "scraping") { @@ -174,7 +179,7 @@ async function crawlStatusWS( // Basically just middleware and error wrapping export async function crawlStatusWSController( ws: WebSocket, - req: RequestWithAuth + req: RequestWithAuth, ) { try { const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus); @@ -182,7 +187,7 @@ export async function crawlStatusWSController( if (!auth.success) { return close(ws, 3000, { type: "error", - error: auth.error + error: auth.error, }); } @@ -201,7 +206,7 @@ export async function crawlStatusWSController( verbose = JSON.stringify({ message: err.message, name: err.name, - stack: err.stack + stack: err.stack, }); } } @@ -212,13 +217,13 @@ export async function crawlStatusWSController( ") -- ID " + id + " -- " + - verbose + verbose, ); return close(ws, 1011, { type: "error", error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + - id + id, }); } } diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index d88d26fb..59db16d8 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -3,7 +3,7 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, - RequestWithAuth + RequestWithAuth, } from "./types"; import { getCrawl, @@ -11,12 +11,12 @@ import { getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, - getThrottledJobs + getThrottledJobs, } from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById, - supabaseGetJobsById + supabaseGetJobsById, } from "../../lib/supabase-jobs"; import { configDotenv } from "dotenv"; import { Job, JobState } from "bullmq"; @@ -70,7 +70,7 @@ export async function getJobs(ids: string[]) { export async function crawlStatusController( req: RequestWithAuth, res: Response, - isBatch = false + isBatch = false, ) { const sc = await getCrawl(req.params.jobId); if (!sc) { @@ -90,7 +90,9 @@ export async function crawlStatusController( let jobIDs = await getCrawlJobs(req.params.jobId); let jobStatuses = await Promise.all( - jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const) + jobIDs.map( + async (x) => [x, await getScrapeQueue().getJobState(x)] as const, + ), ); const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id))); @@ -124,7 +126,7 @@ export async function crawlStatusController( const doneJobsOrder = await getDoneJobsOrdered( req.params.jobId, start, - end ?? -1 + end ?? -1, ); let doneJobs: Job[] = []; @@ -158,7 +160,7 @@ export async function crawlStatusController( if (job.returnvalue === undefined) { logger.warn( "Job was considered done, but returnvalue is undefined!", - { jobId: job.id, state } + { jobId: job.id, state }, ); continue; } @@ -175,8 +177,8 @@ export async function crawlStatusController( doneJobs = ( await Promise.all( (await getJobs(doneJobsOrder)).map(async (x) => - (await x.getState()) === "failed" ? null : x - ) + (await x.getState()) === "failed" ? null : x, + ), ) ).filter((x) => x !== null) as Job[]; } @@ -185,7 +187,7 @@ export async function crawlStatusController( const protocol = process.env.ENV === "local" ? req.protocol : "https"; const nextURL = new URL( - `${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}` + `${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`, ); nextURL.searchParams.set("skip", (start + data.length).toString()); @@ -215,6 +217,6 @@ export async function crawlStatusController( status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this ? undefined : nextURL.href, - data: data + data: data, }); } diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index dac1b735..1fb470f9 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -5,7 +5,7 @@ import { crawlRequestSchema, CrawlResponse, RequestWithAuth, - toLegacyCrawlerOptions + toLegacyCrawlerOptions, } from "./types"; import { addCrawlJob, @@ -14,7 +14,7 @@ import { lockURL, lockURLs, saveCrawl, - StoredCrawl + StoredCrawl, } from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; @@ -26,7 +26,7 @@ import { scrapeOptions as scrapeOptionsSchema } from "./types"; export async function crawlController( req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, - res: Response + res: Response, ) { const preNormalizedBody = req.body; req.body = crawlRequestSchema.parse(req.body); @@ -37,12 +37,12 @@ export async function crawlController( module: "api/v1", method: "crawlController", teamId: req.auth.team_id, - plan: req.auth.plan + plan: req.auth.plan, }); logger.debug("Crawl " + id + " starting", { request: req.body, originalRequest: preNormalizedBody, - account: req.account + account: req.account, }); await logCrawl(id, req.auth.team_id); @@ -56,7 +56,7 @@ export async function crawlController( const crawlerOptions = { ...req.body, url: undefined, - scrapeOptions: undefined + scrapeOptions: undefined, }; const scrapeOptions = req.body.scrapeOptions; @@ -86,7 +86,7 @@ export async function crawlController( logger.debug("Determined limit: " + crawlerOptions.limit, { remainingCredits, bodyLimit: originalLimit, - originalBodyLimit: preNormalizedBody.limit + originalBodyLimit: preNormalizedBody.limit, }); const sc: StoredCrawl = { @@ -96,7 +96,7 @@ export async function crawlController( internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter team_id: req.auth.team_id, createdAt: Date.now(), - plan: req.auth.plan + plan: req.auth.plan, }; const crawler = crawlToCrawler(id, sc); @@ -105,7 +105,7 @@ export async function crawlController( sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification); } catch (e) { logger.debug("Failed to get robots.txt (this is probably fine!)", { - error: e + error: e, }); } @@ -117,7 +117,7 @@ export async function crawlController( if (sitemap !== null && sitemap.length > 0) { logger.debug("Using sitemap of length " + sitemap.length, { - sitemapLength: sitemap.length + sitemapLength: sitemap.length, }); let jobPriority = 20; // If it is over 1000, we need to get the job priority, @@ -127,7 +127,7 @@ export async function crawlController( jobPriority = await getJobPriority({ plan: req.auth.plan, team_id: req.auth.team_id, - basePriority: 21 + basePriority: 21, }); } logger.debug("Using job priority " + jobPriority, { jobPriority }); @@ -149,12 +149,12 @@ export async function crawlController( crawl_id: id, sitemapped: true, webhook: req.body.webhook, - v1: true + v1: true, }, opts: { jobId: uuid, - priority: 20 - } + priority: 20, + }, }; }); @@ -162,18 +162,18 @@ export async function crawlController( await lockURLs( id, sc, - jobs.map((x) => x.data.url) + jobs.map((x) => x.data.url), ); logger.debug("Adding scrape jobs to Redis..."); await addCrawlJobs( id, - jobs.map((x) => x.opts.jobId) + jobs.map((x) => x.opts.jobId), ); logger.debug("Adding scrape jobs to BullMQ..."); await getScrapeQueue().addBulk(jobs); } else { logger.debug("Sitemap not found or ignored.", { - ignoreSitemap: sc.crawlerOptions.ignoreSitemap + ignoreSitemap: sc.crawlerOptions.ignoreSitemap, }); logger.debug("Locking URL..."); @@ -192,12 +192,12 @@ export async function crawlController( origin: "api", crawl_id: id, webhook: req.body.webhook, - v1: true + v1: true, }, { - priority: 15 + priority: 15, }, - jobId + jobId, ); logger.debug("Adding scrape job to BullMQ...", { jobId }); await addCrawlJob(id, jobId); @@ -206,7 +206,7 @@ export async function crawlController( if (req.body.webhook) { logger.debug("Calling webhook with crawl.started...", { - webhook: req.body.webhook + webhook: req.body.webhook, }); await callWebhook( req.auth.team_id, @@ -214,7 +214,7 @@ export async function crawlController( null, req.body.webhook, true, - "crawl.started" + "crawl.started", ); } @@ -223,6 +223,6 @@ export async function crawlController( return res.status(200).json({ success: true, id, - url: `${protocol}://${req.get("host")}/v1/crawl/${id}` + url: `${protocol}://${req.get("host")}/v1/crawl/${id}`, }); } diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 74b188e7..0c286253 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -6,7 +6,7 @@ import { extractRequestSchema, ExtractResponse, MapDocument, - scrapeOptions + scrapeOptions, } from "./types"; import { Document } from "../../lib/entities"; import Redis from "ioredis"; @@ -43,7 +43,7 @@ const MIN_REQUIRED_LINKS = 1; */ export async function extractController( req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, - res: Response + res: Response, ) { const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true"; @@ -81,7 +81,7 @@ export async function extractController( // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping ignoreSitemap: !selfHosted ? true : false, includeMetadata: true, - includeSubdomains: req.body.includeSubdomains + includeSubdomains: req.body.includeSubdomains, }); let mappedLinks = mapResults.links as MapDocument[]; @@ -89,7 +89,8 @@ export async function extractController( mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); let mappedLinksRerank = mappedLinks.map( - (x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}` + (x) => + `url: ${x.url}, title: ${x.title}, description: ${x.description}`, ); // Filter by path prefix if present @@ -103,31 +104,31 @@ export async function extractController( const linksAndScores = await performRanking( mappedLinksRerank, mappedLinks.map((l) => l.url), - mapUrl + mapUrl, ); // First try with high threshold let filteredLinks = filterAndProcessLinks( mappedLinks, linksAndScores, - INITIAL_SCORE_THRESHOLD + INITIAL_SCORE_THRESHOLD, ); // If we don't have enough high-quality links, try with lower threshold if (filteredLinks.length < MIN_REQUIRED_LINKS) { logger.info( - `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...` + `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`, ); filteredLinks = filterAndProcessLinks( mappedLinks, linksAndScores, - FALLBACK_SCORE_THRESHOLD + FALLBACK_SCORE_THRESHOLD, ); if (filteredLinks.length === 0) { // If still no results, take top N results regardless of score logger.warn( - `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.` + `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`, ); filteredLinks = linksAndScores .sort((a, b) => b.score - a.score) @@ -135,7 +136,9 @@ export async function extractController( .map((x) => mappedLinks.find((link) => link.url === x.link)) .filter( (x): x is MapDocument => - x !== undefined && x.url !== undefined && !isUrlBlocked(x.url) + x !== undefined && + x.url !== undefined && + !isUrlBlocked(x.url), ); } } @@ -161,7 +164,7 @@ export async function extractController( return res.status(400).json({ success: false, error: - "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs." + "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.", }); } @@ -174,7 +177,7 @@ export async function extractController( const jobPriority = await getJobPriority({ plan: req.auth.plan as PlanType, team_id: req.auth.team_id, - basePriority: 10 + basePriority: 10, }); await addScrapeJob( @@ -186,11 +189,11 @@ export async function extractController( internalOptions: {}, plan: req.auth.plan!, origin, - is_scrape: true + is_scrape: true, }, {}, jobId, - jobPriority + jobPriority, ); try { @@ -208,12 +211,12 @@ export async function extractController( ) { throw { status: 408, - error: "Request timed out" + error: "Request timed out", }; } else { throw { status: 500, - error: `(Internal server error) - ${e && e.message ? e.message : e}` + error: `(Internal server error) - ${e && e.message ? e.message : e}`, }; } } @@ -225,7 +228,7 @@ export async function extractController( } catch (e) { return res.status(e.status).json({ success: false, - error: e.error + error: e.error, }); } @@ -237,11 +240,11 @@ export async function extractController( "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "), prompt: req.body.prompt, - schema: req.body.schema + schema: req.body.schema, }, docs.map((x) => buildDocument(x)).join("\n"), undefined, - true // isExtractEndpoint + true, // isExtractEndpoint ); // TODO: change this later @@ -249,9 +252,9 @@ export async function extractController( billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch( (error) => { logger.error( - `Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}` + `Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`, ); - } + }, ); let data = completions.extract ?? {}; @@ -269,14 +272,14 @@ export async function extractController( url: req.body.urls.join(", "), scrapeOptions: req.body, origin: req.body.origin ?? "api", - num_tokens: completions.numTokens ?? 0 + num_tokens: completions.numTokens ?? 0, }); return res.status(200).json({ success: true, data: data, scrape_id: id, - warning: warning + warning: warning, }); } @@ -295,13 +298,13 @@ function filterAndProcessLinks( score: number; originalIndex: number; }[], - threshold: number + threshold: number, ): MapDocument[] { return linksAndScores .filter((x) => x.score > threshold) .map((x) => mappedLinks.find((link) => link.url === x.link)) .filter( (x): x is MapDocument => - x !== undefined && x.url !== undefined && !isUrlBlocked(x.url) + x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), ); } diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 7ddd7b78..cd302708 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -4,7 +4,7 @@ import { MapDocument, mapRequestSchema, RequestWithAuth, - scrapeOptions + scrapeOptions, } from "./types"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { MapResponse, MapRequest } from "./types"; @@ -13,7 +13,7 @@ import { checkAndUpdateURLForMap, isSameDomain, isSameSubdomain, - removeDuplicateUrls + removeDuplicateUrls, } from "../../lib/validateUrl"; import { fireEngineMap } from "../../search/fireEngine"; import { billTeam } from "../../services/billing/credit_billing"; @@ -49,7 +49,7 @@ export async function getMapResults({ plan, origin, includeMetadata = false, - allowExternalLinks + allowExternalLinks, }: { url: string; search?: string; @@ -72,13 +72,13 @@ export async function getMapResults({ crawlerOptions: { ...crawlerOptions, limit: crawlerOptions.sitemapOnly ? 10000000 : limit, - scrapeOptions: undefined + scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), internalOptions: {}, team_id: teamId, createdAt: Date.now(), - plan: plan + plan: plan, }; const crawler = crawlToCrawler(id, sc); @@ -114,7 +114,7 @@ export async function getMapResults({ const resultsPerPage = 100; const maxPages = Math.ceil( - Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage + Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage, ); const cacheKey = `fireEngineMap:${mapUrl}`; @@ -129,12 +129,12 @@ export async function getMapResults({ const fetchPage = async (page: number) => { return fireEngineMap(mapUrl, { numResults: resultsPerPage, - page: page + page: page, }); }; pagePromises = Array.from({ length: maxPages }, (_, i) => - fetchPage(i + 1) + fetchPage(i + 1), ); allResults = await Promise.all(pagePromises); @@ -144,7 +144,7 @@ export async function getMapResults({ // Parallelize sitemap fetch with serper search const [sitemap, ...searchResults] = await Promise.all([ ignoreSitemap ? null : crawler.tryGetSitemap(true), - ...(cachedResult ? [] : pagePromises) + ...(cachedResult ? [] : pagePromises), ]); if (!cachedResult) { @@ -172,7 +172,7 @@ export async function getMapResults({ links = [ mapResults[0].url, ...mapResults.slice(1).map((x) => x.url), - ...links + ...links, ]; } else { mapResults.map((x) => { @@ -218,13 +218,13 @@ export async function getMapResults({ links: includeMetadata ? mapResults : linksToReturn, scrape_id: origin?.includes("website") ? id : undefined, job_id: id, - time_taken: (new Date().getTime() - Date.now()) / 1000 + time_taken: (new Date().getTime() - Date.now()) / 1000, }; } export async function mapController( req: RequestWithAuth<{}, MapResponse, MapRequest>, - res: Response + res: Response, ) { req.body = mapRequestSchema.parse(req.body); @@ -237,13 +237,13 @@ export async function mapController( crawlerOptions: req.body, origin: req.body.origin, teamId: req.auth.team_id, - plan: req.auth.plan + plan: req.auth.plan, }); // Bill the team billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => { logger.error( - `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}` + `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`, ); }); @@ -261,13 +261,13 @@ export async function mapController( crawlerOptions: {}, scrapeOptions: {}, origin: req.body.origin ?? "api", - num_tokens: 0 + num_tokens: 0, }); const response = { success: true as const, links: result.links, - scrape_id: result.scrape_id + scrape_id: result.scrape_id, }; return res.status(200).json(response); diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index b366b79e..7fec74a1 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -13,29 +13,29 @@ export async function scrapeStatusController(req: any, res: any) { const job = await supabaseGetJobByIdOnlyData(req.params.jobId); const allowedTeams = [ "41bdbfe1-0579-4d9b-b6d5-809f16be12f5", - "511544f2-2fce-4183-9c59-6c29b02c69b5" + "511544f2-2fce-4183-9c59-6c29b02c69b5", ]; if (!allowedTeams.includes(job?.team_id)) { return res.status(403).json({ success: false, - error: "You are not allowed to access this resource." + error: "You are not allowed to access this resource.", }); } return res.status(200).json({ success: true, - data: job?.docs[0] + data: job?.docs[0], }); } catch (error) { if (error instanceof Error && error.message == "Too Many Requests") { return res.status(429).json({ success: false, - error: "Rate limit exceeded. Please try again later." + error: "Rate limit exceeded. Please try again later.", }); } else { return res.status(500).json({ success: false, - error: "An unexpected error occurred." + error: "An unexpected error occurred.", }); } } diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 05cc68e3..ddd5da74 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -5,7 +5,7 @@ import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, - ScrapeResponse + ScrapeResponse, } from "./types"; import { billTeam } from "../../services/billing/credit_billing"; import { v4 as uuidv4 } from "uuid"; @@ -17,7 +17,7 @@ import { getScrapeQueue } from "../../services/queue-service"; export async function scrapeController( req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, - res: Response + res: Response, ) { req.body = scrapeRequestSchema.parse(req.body); let earlyReturn = false; @@ -30,7 +30,7 @@ export async function scrapeController( const jobPriority = await getJobPriority({ plan: req.auth.plan as PlanType, team_id: req.auth.team_id, - basePriority: 10 + basePriority: 10, }); await addScrapeJob( @@ -42,18 +42,18 @@ export async function scrapeController( internalOptions: {}, plan: req.auth.plan!, origin: req.body.origin, - is_scrape: true + is_scrape: true, }, {}, jobId, - jobPriority + jobPriority, ); const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce( (a, x) => (x.type === "wait" ? (x.milliseconds ?? 0) : 0) + a, - 0 + 0, ); let doc: Document; @@ -67,12 +67,12 @@ export async function scrapeController( ) { return res.status(408).json({ success: false, - error: "Request timed out" + error: "Request timed out", }); } else { return res.status(500).json({ success: false, - error: `(Internal server error) - ${e && e.message ? e.message : e}` + error: `(Internal server error) - ${e && e.message ? e.message : e}`, }); } } @@ -99,10 +99,10 @@ export async function scrapeController( billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch( (error) => { logger.error( - `Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}` + `Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`, ); // Optionally, you could notify an admin or add to a retry queue here - } + }, ); if (!req.body.formats.includes("rawHtml")) { @@ -123,12 +123,12 @@ export async function scrapeController( url: req.body.url, scrapeOptions: req.body, origin: origin, - num_tokens: numTokens + num_tokens: numTokens, }); return res.status(200).json({ success: true, data: doc, - scrape_id: origin?.includes("website") ? jobId : undefined + scrape_id: origin?.includes("website") ? jobId : undefined, }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index f9fa2392..57e208b4 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -8,7 +8,7 @@ import { ExtractorOptions, PageOptions, ScrapeActionContent, - Document as V0Document + Document as V0Document, } from "../../lib/entities"; import { InternalOptions } from "../../scraper/scrapeURL"; @@ -34,7 +34,7 @@ export const url = z.preprocess( .regex(/^https?:\/\//, "URL uses unsupported protocol") .refine( (x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x), - "URL must have a valid top-level domain or be a valid path" + "URL must have a valid top-level domain or be a valid path", ) .refine((x) => { try { @@ -46,8 +46,8 @@ export const url = z.preprocess( }, "Invalid URL") .refine( (x) => !isUrlBlocked(x as string), - "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." - ) + "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", + ), ); const strictMessage = @@ -60,9 +60,9 @@ export const extractOptions = z systemPrompt: z .string() .default( - "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required." + "Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required.", ), - prompt: z.string().optional() + prompt: z.string().optional(), }) .strict(strictMessage); @@ -74,7 +74,7 @@ export const actionsSchema = z.array( .object({ type: z.literal("wait"), milliseconds: z.number().int().positive().finite().optional(), - selector: z.string().optional() + selector: z.string().optional(), }) .refine( (data) => @@ -82,38 +82,38 @@ export const actionsSchema = z.array( !(data.milliseconds !== undefined && data.selector !== undefined), { message: - "Either 'milliseconds' or 'selector' must be provided, but not both." - } + "Either 'milliseconds' or 'selector' must be provided, but not both.", + }, ), z.object({ type: z.literal("click"), - selector: z.string() + selector: z.string(), }), z.object({ type: z.literal("screenshot"), - fullPage: z.boolean().default(false) + fullPage: z.boolean().default(false), }), z.object({ type: z.literal("write"), - text: z.string() + text: z.string(), }), z.object({ type: z.literal("press"), - key: z.string() + key: z.string(), }), z.object({ type: z.literal("scroll"), direction: z.enum(["up", "down"]).optional().default("down"), - selector: z.string().optional() + selector: z.string().optional(), }), z.object({ - type: z.literal("scrape") + type: z.literal("scrape"), }), z.object({ type: z.literal("executeJavascript"), - script: z.string() - }) - ]) + script: z.string(), + }), + ]), ); export const scrapeOptions = z @@ -126,14 +126,14 @@ export const scrapeOptions = z "links", "screenshot", "screenshot@fullPage", - "extract" + "extract", ]) .array() .optional() .default(["markdown"]) .refine( (x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), - "You may only specify either screenshot or screenshot@fullPage" + "You may only specify either screenshot or screenshot@fullPage", ), headers: z.record(z.string(), z.string()).optional(), includeTags: z.string().array().optional(), @@ -155,11 +155,11 @@ export const scrapeOptions = z (val) => !val || Object.keys(countries).includes(val.toUpperCase()), { message: - "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code." - } + "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", + }, ) .transform((val) => (val ? val.toUpperCase() : "US")), - languages: z.string().array().optional() + languages: z.string().array().optional(), }) .optional(), @@ -173,15 +173,15 @@ export const scrapeOptions = z (val) => !val || Object.keys(countries).includes(val.toUpperCase()), { message: - "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code." - } + "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.", + }, ) .transform((val) => (val ? val.toUpperCase() : "US")), - languages: z.string().array().optional() + languages: z.string().array().optional(), }) .optional(), skipTlsVerification: z.boolean().default(false), - removeBase64Images: z.boolean().default(true) + removeBase64Images: z.boolean().default(true), }) .strict(strictMessage); @@ -199,7 +199,7 @@ export const extractV1Options = z includeSubdomains: z.boolean().default(true), allowExternalLinks: z.boolean().default(false), origin: z.string().optional().default("api"), - timeout: z.number().int().positive().finite().safe().default(60000) + timeout: z.number().int().positive().finite().safe().default(60000), }) .strict(strictMessage); @@ -212,7 +212,7 @@ export const scrapeRequestSchema = scrapeOptions .extend({ url, origin: z.string().optional().default("api"), - timeout: z.number().int().positive().finite().safe().default(30000) + timeout: z.number().int().positive().finite().safe().default(30000), }) .strict(strictMessage) .refine( @@ -226,8 +226,8 @@ export const scrapeRequestSchema = scrapeOptions }, { message: - "When 'extract' format is specified, 'extract' options must be provided, and vice versa" - } + "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + }, ) .transform((obj) => { if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) { @@ -250,9 +250,9 @@ export const webhookSchema = z.preprocess( z .object({ url: z.string().url(), - headers: z.record(z.string(), z.string()).default({}) + headers: z.record(z.string(), z.string()).default({}), }) - .strict(strictMessage) + .strict(strictMessage), ); export const batchScrapeRequestSchema = scrapeOptions @@ -260,7 +260,7 @@ export const batchScrapeRequestSchema = scrapeOptions urls: url.array(), origin: z.string().optional().default("api"), webhook: webhookSchema.optional(), - appendToId: z.string().uuid().optional() + appendToId: z.string().uuid().optional(), }) .strict(strictMessage) .refine( @@ -274,8 +274,8 @@ export const batchScrapeRequestSchema = scrapeOptions }, { message: - "When 'extract' format is specified, 'extract' options must be provided, and vice versa" - } + "When 'extract' format is specified, 'extract' options must be provided, and vice versa", + }, ); export type BatchScrapeRequest = z.infer; @@ -292,7 +292,7 @@ const crawlerOptions = z ignoreRobotsTxt: z.boolean().default(false), ignoreSitemap: z.boolean().default(false), deduplicateSimilarURLs: z.boolean().default(true), - ignoreQueryParameters: z.boolean().default(false) + ignoreQueryParameters: z.boolean().default(false), }) .strict(strictMessage); @@ -314,7 +314,7 @@ export const crawlRequestSchema = crawlerOptions origin: z.string().optional().default("api"), scrapeOptions: scrapeOptions.default({}), webhook: webhookSchema.optional(), - limit: z.number().default(10000) + limit: z.number().default(10000), }) .strict(strictMessage); @@ -340,7 +340,7 @@ export const mapRequestSchema = crawlerOptions search: z.string().optional(), ignoreSitemap: z.boolean().default(false), sitemapOnly: z.boolean().default(false), - limit: z.number().min(1).max(5000).default(5000) + limit: z.number().min(1).max(5000).default(5000), }) .strict(strictMessage); @@ -510,7 +510,7 @@ export type AuthCreditUsageChunk = { export interface RequestWithMaybeACUC< ReqParams = {}, ReqBody = undefined, - ResBody = undefined + ResBody = undefined, > extends Request { acuc?: AuthCreditUsageChunk; } @@ -518,7 +518,7 @@ export interface RequestWithMaybeACUC< export interface RequestWithACUC< ReqParams = {}, ReqBody = undefined, - ResBody = undefined + ResBody = undefined, > extends Request { acuc: AuthCreditUsageChunk; } @@ -526,7 +526,7 @@ export interface RequestWithACUC< export interface RequestWithAuth< ReqParams = {}, ReqBody = undefined, - ResBody = undefined + ResBody = undefined, > extends Request { auth: AuthObject; account?: Account; @@ -535,7 +535,7 @@ export interface RequestWithAuth< export interface RequestWithMaybeAuth< ReqParams = {}, ReqBody = undefined, - ResBody = undefined + ResBody = undefined, > extends RequestWithMaybeACUC { auth?: AuthObject; account?: Account; @@ -544,7 +544,7 @@ export interface RequestWithMaybeAuth< export interface RequestWithAuth< ReqParams = {}, ReqBody = undefined, - ResBody = undefined + ResBody = undefined, > extends RequestWithACUC { auth: AuthObject; account?: Account; @@ -569,7 +569,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) { ignoreRobotsTxt: x.ignoreRobotsTxt, ignoreSitemap: x.ignoreSitemap, deduplicateSimilarURLs: x.deduplicateSimilarURLs, - ignoreQueryParameters: x.ignoreQueryParameters + ignoreQueryParameters: x.ignoreQueryParameters, }; } @@ -589,11 +589,11 @@ export function fromLegacyCrawlerOptions(x: any): { ignoreRobotsTxt: x.ignoreRobotsTxt, ignoreSitemap: x.ignoreSitemap, deduplicateSimilarURLs: x.deduplicateSimilarURLs, - ignoreQueryParameters: x.ignoreQueryParameters + ignoreQueryParameters: x.ignoreQueryParameters, }), internalOptions: { - v0CrawlOnlyUrls: x.returnOnlyUrls - } + v0CrawlOnlyUrls: x.returnOnlyUrls, + }, }; } @@ -605,7 +605,7 @@ export interface MapDocument { export function fromLegacyScrapeOptions( pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, - timeout: number | undefined + timeout: number | undefined, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { return { scrapeOptions: scrapeOptions.parse({ @@ -621,7 +621,7 @@ export function fromLegacyScrapeOptions( extractorOptions.mode.includes("llm-extraction") ? ("extract" as const) : null, - "links" + "links", ].filter((x) => x !== null), waitFor: pageOptions.waitFor, headers: pageOptions.headers, @@ -646,16 +646,16 @@ export function fromLegacyScrapeOptions( ? { systemPrompt: extractorOptions.extractionPrompt, prompt: extractorOptions.userPrompt, - schema: extractorOptions.extractionSchema + schema: extractorOptions.extractionSchema, } : undefined, - mobile: pageOptions.mobile + mobile: pageOptions.mobile, }), internalOptions: { atsv: pageOptions.atsv, v0DisableJsDom: pageOptions.disableJsDom, - v0UseFastMode: pageOptions.useFastMode - } + v0UseFastMode: pageOptions.useFastMode, + }, // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks }; } @@ -664,12 +664,12 @@ export function fromLegacyCombo( pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, - crawlerOptions: any + crawlerOptions: any, ): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } { const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions( pageOptions, extractorOptions, - timeout + timeout, ); const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; @@ -677,7 +677,7 @@ export function fromLegacyCombo( export function toLegacyDocument( document: Document, - internalOptions: InternalOptions + internalOptions: InternalOptions, ): V0Document | { url: string } { if (internalOptions.v0CrawlOnlyUrls) { return { url: document.metadata.sourceURL! }; @@ -696,9 +696,9 @@ export function toLegacyDocument( statusCode: undefined, pageError: document.metadata.error, pageStatusCode: document.metadata.statusCode, - screenshot: document.screenshot + screenshot: document.screenshot, }, actions: document.actions, - warning: document.warning + warning: document.warning, }; } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index a4f4445b..adc080f2 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -46,12 +46,12 @@ serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ queues: [new BullAdapter(getScrapeQueue())], - serverAdapter: serverAdapter + serverAdapter: serverAdapter, }); app.use( `/admin/${process.env.BULL_AUTH_KEY}/queues`, - serverAdapter.getRouter() + serverAdapter.getRouter(), ); app.get("/", (req, res) => { @@ -75,7 +75,7 @@ function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { logger.info(`Worker ${process.pid} listening on port ${port}`); logger.info( - `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`, ); }); @@ -103,7 +103,7 @@ app.get(`/serverHealthCheck`, async (req, res) => { const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs return res.status(noWaitingJobs ? 200 : 500).json({ - waitingJobs + waitingJobs, }); } catch (error) { Sentry.captureException(error); @@ -120,7 +120,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => { const getWaitingJobsCount = async () => { const scrapeQueue = getScrapeQueue(); const [waitingJobsCount] = await Promise.all([ - scrapeQueue.getWaitingCount() + scrapeQueue.getWaitingCount(), ]); return waitingJobsCount; @@ -140,15 +140,15 @@ app.get("/serverHealthCheck/notify", async (req, res) => { const message = { text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ timeout / 60000 - } minute(s).` + } minute(s).`, }; const response = await fetch(slackWebhookUrl, { method: "POST", headers: { - "Content-Type": "application/json" + "Content-Type": "application/json", }, - body: JSON.stringify(message) + body: JSON.stringify(message), }); if (!response.ok) { @@ -176,7 +176,7 @@ app.use( err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, - next: NextFunction + next: NextFunction, ) => { if (err instanceof ZodError) { if ( @@ -192,7 +192,7 @@ app.use( } else { next(err); } - } + }, ); Sentry.setupExpressErrorHandler(app); @@ -202,7 +202,7 @@ app.use( err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, - next: NextFunction + next: NextFunction, ) => { if ( err instanceof SyntaxError && @@ -222,7 +222,7 @@ app.use( verbose = JSON.stringify({ message: err.message, name: err.name, - stack: err.stack + stack: err.stack, }); } } @@ -233,15 +233,15 @@ app.use( ") -- ID " + id + " -- " + - verbose + verbose, ); res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + - id + id, }); - } + }, ); logger.info(`Worker ${process.pid} started`); diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 47ecaf18..de7017ea 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -10,7 +10,7 @@ import { logger } from "../logger"; export async function generateCompletions( documents: Document[], extractionOptions: ExtractorOptions | undefined, - mode: "markdown" | "raw-html" + mode: "markdown" | "raw-html", ): Promise { // const schema = zodToJsonSchema(options.schema) @@ -32,7 +32,7 @@ export async function generateCompletions( schema: schema, prompt: prompt, systemPrompt: systemPrompt, - mode: mode + mode: mode, }); // Validate the JSON output against the schema using AJV if (schema) { @@ -43,8 +43,8 @@ export async function generateCompletions( `JSON parsing error(s): ${validate.errors ?.map((err) => err.message) .join( - ", " - )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.` + ", ", + )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`, ); } } @@ -57,7 +57,7 @@ export async function generateCompletions( default: throw new Error("Invalid client"); } - }) + }), ); return completions; diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 563863c0..cc1355de 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -14,7 +14,7 @@ const defaultPrompt = function prepareOpenAIDoc( document: Document, - mode: "markdown" | "raw-html" + mode: "markdown" | "raw-html", ): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null { let markdown = document.markdown; @@ -50,7 +50,7 @@ export async function generateOpenAICompletions({ systemPrompt = defaultPrompt, prompt, temperature, - mode + mode, }: { client: OpenAI; model?: string; @@ -68,7 +68,7 @@ export async function generateOpenAICompletions({ return { ...document, warning: - "LLM extraction was not performed since the document's content is empty or missing." + "LLM extraction was not performed since the document's content is empty or missing.", }; } const [content, numTokens] = preparedDoc; @@ -81,21 +81,21 @@ export async function generateOpenAICompletions({ messages: [ { role: "system", - content: systemPrompt + content: systemPrompt, }, { role: "user", content }, { role: "user", - content: `Transform the above content into structured json output based on the following user request: ${prompt}` - } + content: `Transform the above content into structured json output based on the following user request: ${prompt}`, + }, ], response_format: { type: "json_object" }, - temperature + temperature, }); try { llmExtraction = JSON.parse( - (jsonCompletion.choices[0].message.content ?? "").trim() + (jsonCompletion.choices[0].message.content ?? "").trim(), ); } catch (e) { throw new Error("Invalid JSON"); @@ -106,9 +106,9 @@ export async function generateOpenAICompletions({ messages: [ { role: "system", - content: systemPrompt + content: systemPrompt, }, - { role: "user", content } + { role: "user", content }, ], tools: [ { @@ -116,12 +116,12 @@ export async function generateOpenAICompletions({ function: { name: "extract_content", description: "Extracts the content from the given webpage(s)", - parameters: schema - } - } + parameters: schema, + }, + }, ], tool_choice: { type: "function", function: { name: "extract_content" } }, - temperature + temperature, }); const c = completion.choices[0].message.tool_calls[0].function.arguments; @@ -140,6 +140,6 @@ export async function generateOpenAICompletions({ warning: numTokens > maxTokens ? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.` - : undefined + : undefined, }; } diff --git a/apps/api/src/lib/__tests__/html-to-markdown.test.ts b/apps/api/src/lib/__tests__/html-to-markdown.test.ts index f69c2949..d35e2cce 100644 --- a/apps/api/src/lib/__tests__/html-to-markdown.test.ts +++ b/apps/api/src/lib/__tests__/html-to-markdown.test.ts @@ -31,16 +31,16 @@ describe("parseMarkdown", () => { { html: "

Unclosed tag", expected: "Unclosed tag" }, { html: "

Missing closing div", - expected: "Missing closing div" + expected: "Missing closing div", }, { html: "

Wrong nesting

", - expected: "**Wrong nesting**" + expected: "**Wrong nesting**", }, { html: 'Link without closing tag', - expected: "[Link without closing tag](http://example.com)" - } + expected: "[Link without closing tag](http://example.com)", + }, ]; for (const { html, expected } of invalidHtmls) { diff --git a/apps/api/src/lib/__tests__/job-priority.test.ts b/apps/api/src/lib/__tests__/job-priority.test.ts index 4bd5fda9..1a7550ef 100644 --- a/apps/api/src/lib/__tests__/job-priority.test.ts +++ b/apps/api/src/lib/__tests__/job-priority.test.ts @@ -1,7 +1,7 @@ import { getJobPriority, addJobPriority, - deleteJobPriority + deleteJobPriority, } from "../job-priority"; import { redisConnection } from "../../services/queue-service"; import { PlanType } from "../../types"; @@ -11,8 +11,8 @@ jest.mock("../../services/queue-service", () => ({ sadd: jest.fn(), srem: jest.fn(), scard: jest.fn(), - expire: jest.fn() - } + expire: jest.fn(), + }, })); describe("Job Priority Tests", () => { @@ -26,11 +26,11 @@ describe("Job Priority Tests", () => { await addJobPriority(team_id, job_id); expect(redisConnection.sadd).toHaveBeenCalledWith( `limit_team_id:${team_id}`, - job_id + job_id, ); expect(redisConnection.expire).toHaveBeenCalledWith( `limit_team_id:${team_id}`, - 60 + 60, ); }); @@ -40,7 +40,7 @@ describe("Job Priority Tests", () => { await deleteJobPriority(team_id, job_id); expect(redisConnection.srem).toHaveBeenCalledWith( `limit_team_id:${team_id}`, - job_id + job_id, ); }); @@ -89,7 +89,7 @@ describe("Job Priority Tests", () => { await addJobPriority(team_id, job_id1); expect(redisConnection.expire).toHaveBeenCalledWith( `limit_team_id:${team_id}`, - 60 + 60, ); // Clear the mock calls @@ -99,7 +99,7 @@ describe("Job Priority Tests", () => { await addJobPriority(team_id, job_id2); expect(redisConnection.expire).toHaveBeenCalledWith( `limit_team_id:${team_id}`, - 60 + 60, ); }); @@ -112,7 +112,7 @@ describe("Job Priority Tests", () => { await addJobPriority(team_id, job_id); expect(redisConnection.expire).toHaveBeenCalledWith( `limit_team_id:${team_id}`, - 60 + 60, ); // Fast-forward time by 59 seconds diff --git a/apps/api/src/lib/batch-process.ts b/apps/api/src/lib/batch-process.ts index 20bb4ab6..1e4ac7be 100644 --- a/apps/api/src/lib/batch-process.ts +++ b/apps/api/src/lib/batch-process.ts @@ -1,7 +1,7 @@ export async function batchProcess( array: T[], batchSize: number, - asyncFunction: (item: T, index: number) => Promise + asyncFunction: (item: T, index: number) => Promise, ): Promise { const batches: T[][] = []; for (let i = 0; i < array.length; i += batchSize) { diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts index 30c9f0b4..7dcbf88b 100644 --- a/apps/api/src/lib/cache.ts +++ b/apps/api/src/lib/cache.ts @@ -6,14 +6,14 @@ const logger = _logger.child({ module: "cache" }); export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, { - maxRetriesPerRequest: null + maxRetriesPerRequest: null, }) : null; export function cacheKey( url: string, scrapeOptions: ScrapeOptions, - internalOptions: InternalOptions + internalOptions: InternalOptions, ): string | null { if (!cacheRedis) return null; @@ -49,7 +49,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) { } export async function getEntryFromCache( - key: string + key: string, ): Promise { if (!cacheRedis) return null; diff --git a/apps/api/src/lib/concurrency-limit.ts b/apps/api/src/lib/concurrency-limit.ts index aba1fd3a..8205113f 100644 --- a/apps/api/src/lib/concurrency-limit.ts +++ b/apps/api/src/lib/concurrency-limit.ts @@ -14,37 +14,37 @@ export function getConcurrencyLimitMax(plan: string): number { export async function cleanOldConcurrencyLimitEntries( team_id: string, - now: number = Date.now() + now: number = Date.now(), ) { await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now); } export async function getConcurrencyLimitActiveJobs( team_id: string, - now: number = Date.now() + now: number = Date.now(), ): Promise { return await redisConnection.zrangebyscore( constructKey(team_id), now, - Infinity + Infinity, ); } export async function pushConcurrencyLimitActiveJob( team_id: string, id: string, - now: number = Date.now() + now: number = Date.now(), ) { await redisConnection.zadd( constructKey(team_id), now + stalledJobTimeoutMs, - id + id, ); } export async function removeConcurrencyLimitActiveJob( team_id: string, - id: string + id: string, ) { await redisConnection.zrem(constructKey(team_id), id); } @@ -57,7 +57,7 @@ export type ConcurrencyLimitedJob = { }; export async function takeConcurrencyLimitedJob( - team_id: string + team_id: string, ): Promise { const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN"); if (res === null || res === undefined) { @@ -69,11 +69,11 @@ export async function takeConcurrencyLimitedJob( export async function pushConcurrencyLimitedJob( team_id: string, - job: ConcurrencyLimitedJob + job: ConcurrencyLimitedJob, ) { await redisConnection.zadd( constructQueueKey(team_id), job.priority ?? 1, - JSON.stringify(job) + JSON.stringify(job), ); } diff --git a/apps/api/src/lib/crawl-redis.test.ts b/apps/api/src/lib/crawl-redis.test.ts index ef2dabee..65d4e13a 100644 --- a/apps/api/src/lib/crawl-redis.test.ts +++ b/apps/api/src/lib/crawl-redis.test.ts @@ -3,7 +3,7 @@ import { generateURLPermutations } from "./crawl-redis"; describe("generateURLPermutations", () => { it("generates permutations correctly", () => { const bareHttps = generateURLPermutations("https://firecrawl.dev").map( - (x) => x.href + (x) => x.href, ); expect(bareHttps.length).toBe(4); expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true); @@ -12,7 +12,7 @@ describe("generateURLPermutations", () => { expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true); const bareHttp = generateURLPermutations("http://firecrawl.dev").map( - (x) => x.href + (x) => x.href, ); expect(bareHttp.length).toBe(4); expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true); @@ -21,7 +21,7 @@ describe("generateURLPermutations", () => { expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true); const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map( - (x) => x.href + (x) => x.href, ); expect(wwwHttps.length).toBe(4); expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true); @@ -30,7 +30,7 @@ describe("generateURLPermutations", () => { expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true); const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map( - (x) => x.href + (x) => x.href, ); expect(wwwHttp.length).toBe(4); expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index ab1a238d..6ccb9436 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -24,7 +24,7 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) { method: "saveCrawl", crawlId: id, teamId: crawl.team_id, - plan: crawl.plan + plan: crawl.plan, }); await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX"); @@ -53,7 +53,7 @@ export async function addCrawlJob(id: string, job_id: string) { jobId: job_id, module: "crawl-redis", method: "addCrawlJob", - crawlId: id + crawlId: id, }); await redisConnection.sadd("crawl:" + id + ":jobs", job_id); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); @@ -64,7 +64,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { jobIds: job_ids, module: "crawl-redis", method: "addCrawlJobs", - crawlId: id + crawlId: id, }); await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); @@ -73,19 +73,19 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobDone( id: string, job_id: string, - success: boolean + success: boolean, ) { _logger.debug("Adding done crawl job to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJobDone", - crawlId: id + crawlId: id, }); await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); await redisConnection.expire( "crawl:" + id + ":jobs_done", 24 * 60 * 60, - "NX" + "NX", ); if (success) { @@ -93,7 +93,7 @@ export async function addCrawlJobDone( await redisConnection.expire( "crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, - "NX" + "NX", ); } } @@ -105,12 +105,12 @@ export async function getDoneJobsOrderedLength(id: string): Promise { export async function getDoneJobsOrdered( id: string, start = 0, - end = -1 + end = -1, ): Promise { return await redisConnection.lrange( "crawl:" + id + ":jobs_done_ordered", start, - end + end, ); } @@ -130,7 +130,7 @@ export async function finishCrawl(id: string) { _logger.debug("Marking crawl as finished.", { module: "crawl-redis", method: "finishCrawl", - crawlId: id + crawlId: id, }); const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); if (set === 1) { @@ -141,7 +141,7 @@ export async function finishCrawl(id: string) { _logger.debug("Crawl can not be finished yet, not marking as finished.", { module: "crawl-redis", method: "finishCrawl", - crawlId: id + crawlId: id, }); } } @@ -154,7 +154,7 @@ export async function getThrottledJobs(teamId: string): Promise { return await redisConnection.zrangebyscore( "concurrency-limiter:" + teamId + ":throttled", Date.now(), - Infinity + Infinity, ); } @@ -201,7 +201,7 @@ export function generateURLPermutations(url: string | URL): URL[] { export async function lockURL( id: string, sc: StoredCrawl, - url: string + url: string, ): Promise { let logger = _logger.child({ crawlId: id, @@ -209,7 +209,7 @@ export async function lockURL( method: "lockURL", preNormalizedURL: url, teamId: sc.team_id, - plan: sc.plan + plan: sc.plan, }); if (typeof sc.crawlerOptions?.limit === "number") { @@ -218,7 +218,7 @@ export async function lockURL( sc.crawlerOptions.limit ) { logger.debug( - "Crawl has already hit visited_unique limit, not locking URL." + "Crawl has already hit visited_unique limit, not locking URL.", ); return false; } @@ -231,7 +231,7 @@ export async function lockURL( await redisConnection.expire( "crawl:" + id + ":visited_unique", 24 * 60 * 60, - "NX" + "NX", ); let res: boolean; @@ -242,7 +242,7 @@ export async function lockURL( // logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations }); const x = await redisConnection.sadd( "crawl:" + id + ":visited", - ...permutations + ...permutations, ); res = x === permutations.length; } @@ -250,7 +250,7 @@ export async function lockURL( await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { - res + res, }); return res; } @@ -259,7 +259,7 @@ export async function lockURL( export async function lockURLs( id: string, sc: StoredCrawl, - urls: string[] + urls: string[], ): Promise { urls = urls.map((url) => normalizeURL(url, sc)); const logger = _logger.child({ @@ -267,7 +267,7 @@ export async function lockURLs( module: "crawl-redis", method: "lockURL", teamId: sc.team_id, - plan: sc.plan + plan: sc.plan, }); // Add to visited_unique set @@ -276,7 +276,7 @@ export async function lockURLs( await redisConnection.expire( "crawl:" + id + ":visited_unique", 24 * 60 * 60, - "NX" + "NX", ); let res: boolean; @@ -285,12 +285,12 @@ export async function lockURLs( res = x === urls.length; } else { const allPermutations = urls.flatMap((url) => - generateURLPermutations(url).map((x) => x.href) + generateURLPermutations(url).map((x) => x.href), ); logger.debug("Adding " + allPermutations.length + " URL permutations..."); const x = await redisConnection.sadd( "crawl:" + id + ":visited", - ...allPermutations + ...allPermutations, ); res = x === allPermutations.length; } @@ -304,7 +304,7 @@ export async function lockURLs( export function crawlToCrawler( id: string, sc: StoredCrawl, - newBase?: string + newBase?: string, ): WebCrawler { const crawler = new WebCrawler({ jobId: id, @@ -315,7 +315,7 @@ export function crawlToCrawler( maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, maxCrawledDepth: getAdjustedMaxDepth( sc.originUrl!, - sc.crawlerOptions?.maxDepth ?? 10 + sc.crawlerOptions?.maxDepth ?? 10, ), limit: sc.crawlerOptions?.limit ?? 10000, generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false, @@ -323,7 +323,7 @@ export function crawlToCrawler( allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false, allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false, - ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false + ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false, }); if (sc.robots !== undefined) { diff --git a/apps/api/src/lib/custom-error.ts b/apps/api/src/lib/custom-error.ts index 25502a8e..20a01cb6 100644 --- a/apps/api/src/lib/custom-error.ts +++ b/apps/api/src/lib/custom-error.ts @@ -8,7 +8,7 @@ export class CustomError extends Error { statusCode: number, status: string, message: string = "", - dataIngestionJob?: any + dataIngestionJob?: any, ) { super(message); this.statusCode = statusCode; diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index ceca176c..2754b7cd 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -8,21 +8,21 @@ export const defaultPageOptions = { waitFor: 0, screenshot: false, fullPageScreenshot: false, - parsePDF: true + parsePDF: true, }; export const defaultCrawlerOptions = { allowBackwardCrawling: false, - limit: 10000 + limit: 10000, }; export const defaultCrawlPageOptions = { onlyMainContent: false, includeHtml: false, removeTags: [], - parsePDF: true + parsePDF: true, }; export const defaultExtractorOptions = { - mode: "markdown" + mode: "markdown", }; diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index 044f71a4..26e7ac06 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -1,21 +1,21 @@ import { CohereClient } from "cohere-ai"; import { MapDocument } from "../../controllers/v1/types"; const cohere = new CohereClient({ - token: process.env.COHERE_API_KEY + token: process.env.COHERE_API_KEY, }); export async function rerankDocuments( documents: (string | Record)[], query: string, topN = 3, - model = "rerank-english-v3.0" + model = "rerank-english-v3.0", ) { const rerank = await cohere.v2.rerank({ documents, query, topN, model, - returnDocuments: true + returnDocuments: true, }); return rerank.results @@ -23,6 +23,6 @@ export async function rerankDocuments( .map((x) => ({ document: x.document, index: x.index, - relevanceScore: x.relevanceScore + relevanceScore: x.relevanceScore, })); } diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 7a0020d1..cba1a80b 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -13,7 +13,7 @@ const goExecutablePath = join( process.cwd(), "sharedLibs", "go-html-to-md", - "html-to-markdown.so" + "html-to-markdown.so", ); class GoMarkdownConverter { @@ -51,7 +51,7 @@ class GoMarkdownConverter { } export async function parseMarkdown( - html: string | null | undefined + html: string | null | undefined, ): Promise { if (!html) { return ""; @@ -74,12 +74,12 @@ export async function parseMarkdown( ) { Sentry.captureException(error); logger.error( - `Error converting HTML to Markdown with Go parser: ${error}` + `Error converting HTML to Markdown with Go parser: ${error}`, ); } else { logger.warn( "Tried to use Go parser, but it doesn't exist in the file system.", - { goExecutablePath } + { goExecutablePath }, ); } } @@ -101,7 +101,7 @@ export async function parseMarkdown( var href = node.getAttribute("href").trim(); var title = node.title ? ' "' + node.title + '"' : ""; return "[" + content.trim() + "](" + href + title + ")\n"; - } + }, }); var gfm = turndownPluginGfm.gfm; turndownService.use(gfm); @@ -145,7 +145,7 @@ function removeSkipToContentLinks(markdownContent: string): string { // Remove [Skip to Content](#page) and [Skip to content](#skip) const newMarkdownContent = markdownContent.replace( /\[Skip to Content\]\(#[^\)]*\)/gi, - "" + "", ); return newMarkdownContent; } diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts index 2bafc3e6..7e2d44de 100644 --- a/apps/api/src/lib/job-priority.ts +++ b/apps/api/src/lib/job-priority.ts @@ -31,7 +31,7 @@ export async function deleteJobPriority(team_id, job_id) { export async function getJobPriority({ plan, team_id, - basePriority = 10 + basePriority = 10, }: { plan: PlanType | undefined; team_id: string; @@ -91,12 +91,12 @@ export async function getJobPriority({ } else { // If not, we keep base priority + planModifier return Math.ceil( - basePriority + Math.ceil((setLength - bucketLimit) * planModifier) + basePriority + Math.ceil((setLength - bucketLimit) * planModifier), ); } } catch (e) { logger.error( - `Get job priority failed: ${team_id}, ${plan}, ${basePriority}` + `Get job priority failed: ${team_id}, ${plan}, ${basePriority}`, ); return basePriority; } diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index 6996ffd4..3cc04a11 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -14,14 +14,14 @@ const logFormat = winston.format.printf( name: value.name, message: value.message, stack: value.stack, - cause: value.cause + cause: value.cause, }; } else { return value; } }) : "" - }` + }`, ); export const logger = winston.createLogger({ @@ -34,26 +34,26 @@ export const logger = winston.createLogger({ name: value.name, message: value.message, stack: value.stack, - cause: value.cause + cause: value.cause, }; } else { return value; } - } + }, }), transports: [ new winston.transports.Console({ format: winston.format.combine( winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), winston.format.metadata({ - fillExcept: ["message", "level", "timestamp"] + fillExcept: ["message", "level", "timestamp"], }), ...((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || process.env.ENV !== "production" ? [winston.format.colorize(), logFormat] - : []) - ) - }) - ] + : []), + ), + }), + ], }); diff --git a/apps/api/src/lib/map-cosine.ts b/apps/api/src/lib/map-cosine.ts index 2a089548..a6c06e27 100644 --- a/apps/api/src/lib/map-cosine.ts +++ b/apps/api/src/lib/map-cosine.ts @@ -6,10 +6,10 @@ export function performCosineSimilarity(links: string[], searchQuery: string) { const cosineSimilarity = (vec1: number[], vec2: number[]): number => { const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0); const magnitude1 = Math.sqrt( - vec1.reduce((sum, val) => sum + val * val, 0) + vec1.reduce((sum, val) => sum + val * val, 0), ); const magnitude2 = Math.sqrt( - vec2.reduce((sum, val) => sum + val * val, 0) + vec2.reduce((sum, val) => sum + val * val, 0), ); if (magnitude1 === 0 || magnitude2 === 0) return 0; return dotProduct / (magnitude1 * magnitude2); diff --git a/apps/api/src/lib/ranker.test.ts b/apps/api/src/lib/ranker.test.ts index 2b30de19..b884c2fb 100644 --- a/apps/api/src/lib/ranker.test.ts +++ b/apps/api/src/lib/ranker.test.ts @@ -5,13 +5,13 @@ describe("performRanking", () => { const linksWithContext = [ "url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds", "url: https://example.com/cats, title: Cat care guide, description: Everything about cats", - "url: https://example.com/pets, title: General pet care, description: Care for all types of pets" + "url: https://example.com/pets, title: General pet care, description: Care for all types of pets", ]; const links = [ "https://example.com/dogs", "https://example.com/cats", - "https://example.com/pets" + "https://example.com/pets", ]; const searchQuery = "cats training"; @@ -50,7 +50,7 @@ describe("performRanking", () => { it("should maintain original order for equal scores", async () => { const linksWithContext = [ "url: https://example.com/1, title: Similar content A, description: test", - "url: https://example.com/2, title: Similar content B, description: test" + "url: https://example.com/2, title: Similar content B, description: test", ]; const links = ["https://example.com/1", "https://example.com/2"]; diff --git a/apps/api/src/lib/ranker.ts b/apps/api/src/lib/ranker.ts index 2f06d76d..bffbc9c2 100644 --- a/apps/api/src/lib/ranker.ts +++ b/apps/api/src/lib/ranker.ts @@ -5,14 +5,14 @@ import OpenAI from "openai"; configDotenv(); const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY + apiKey: process.env.OPENAI_API_KEY, }); async function getEmbedding(text: string) { const embedding = await openai.embeddings.create({ model: "text-embedding-ada-002", input: text, - encoding_format: "float" + encoding_format: "float", }); return embedding.data[0].embedding; @@ -39,7 +39,7 @@ const textToVector = (searchQuery: string, text: string): number[] => { async function performRanking( linksWithContext: string[], links: string[], - searchQuery: string + searchQuery: string, ) { try { // Handle invalid inputs @@ -64,7 +64,7 @@ async function performRanking( link: links[index], linkWithContext, score, - originalIndex: index + originalIndex: index, }; } catch (err) { // If embedding fails for a link, return with score 0 @@ -72,10 +72,10 @@ async function performRanking( link: links[index], linkWithContext, score: 0, - originalIndex: index + originalIndex: index, }; } - }) + }), ); // Sort links based on similarity scores while preserving original order for equal scores diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 6c39c722..97e2cecc 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -56,7 +56,7 @@ export class ScrapeEvents { .insert({ job_id: jobId, type: content.type, - content: content + content: content, // created_at }) .select() @@ -73,7 +73,7 @@ export class ScrapeEvents { static async updateScrapeResult( logId: number | null, - result: ScrapeScrapeEvent["result"] + result: ScrapeScrapeEvent["result"], ) { if (logId === null) return; @@ -86,8 +86,8 @@ export class ScrapeEvents { .update({ content: { ...previousLog.content, - result - } + result, + }, }) .eq("id", logId); } catch (error) { @@ -100,7 +100,7 @@ export class ScrapeEvents { await this.insert(((job as any).id ? (job as any).id : job) as string, { type: "queue", event, - worker: process.env.FLY_MACHINE_ID + worker: process.env.FLY_MACHINE_ID, }); } catch (error) { logger.error(`Error logging job event: ${error}`); diff --git a/apps/api/src/lib/validate-country.ts b/apps/api/src/lib/validate-country.ts index 797ea542..bff1c25c 100644 --- a/apps/api/src/lib/validate-country.ts +++ b/apps/api/src/lib/validate-country.ts @@ -6,7 +6,7 @@ export const countries = { continent: "EU", capital: "Andorra la Vella", currency: ["EUR"], - languages: ["ca"] + languages: ["ca"], }, AE: { name: "United Arab Emirates", @@ -15,7 +15,7 @@ export const countries = { continent: "AS", capital: "Abu Dhabi", currency: ["AED"], - languages: ["ar"] + languages: ["ar"], }, AF: { name: "Afghanistan", @@ -24,7 +24,7 @@ export const countries = { continent: "AS", capital: "Kabul", currency: ["AFN"], - languages: ["ps", "uz", "tk"] + languages: ["ps", "uz", "tk"], }, AG: { name: "Antigua and Barbuda", @@ -33,7 +33,7 @@ export const countries = { continent: "NA", capital: "Saint John's", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, AI: { name: "Anguilla", @@ -42,7 +42,7 @@ export const countries = { continent: "NA", capital: "The Valley", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, AL: { name: "Albania", @@ -51,7 +51,7 @@ export const countries = { continent: "EU", capital: "Tirana", currency: ["ALL"], - languages: ["sq"] + languages: ["sq"], }, AM: { name: "Armenia", @@ -60,7 +60,7 @@ export const countries = { continent: "AS", capital: "Yerevan", currency: ["AMD"], - languages: ["hy", "ru"] + languages: ["hy", "ru"], }, AO: { name: "Angola", @@ -69,7 +69,7 @@ export const countries = { continent: "AF", capital: "Luanda", currency: ["AOA"], - languages: ["pt"] + languages: ["pt"], }, AQ: { name: "Antarctica", @@ -78,7 +78,7 @@ export const countries = { continent: "AN", capital: "", currency: [], - languages: [] + languages: [], }, AR: { name: "Argentina", @@ -87,7 +87,7 @@ export const countries = { continent: "SA", capital: "Buenos Aires", currency: ["ARS"], - languages: ["es", "gn"] + languages: ["es", "gn"], }, AS: { name: "American Samoa", @@ -96,7 +96,7 @@ export const countries = { continent: "OC", capital: "Pago Pago", currency: ["USD"], - languages: ["en", "sm"] + languages: ["en", "sm"], }, AT: { name: "Austria", @@ -105,7 +105,7 @@ export const countries = { continent: "EU", capital: "Vienna", currency: ["EUR"], - languages: ["de"] + languages: ["de"], }, AU: { name: "Australia", @@ -114,7 +114,7 @@ export const countries = { continent: "OC", capital: "Canberra", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, AW: { name: "Aruba", @@ -123,7 +123,7 @@ export const countries = { continent: "NA", capital: "Oranjestad", currency: ["AWG"], - languages: ["nl", "pa"] + languages: ["nl", "pa"], }, AX: { name: "Aland", @@ -133,7 +133,7 @@ export const countries = { capital: "Mariehamn", currency: ["EUR"], languages: ["sv"], - partOf: "FI" + partOf: "FI", }, AZ: { name: "Azerbaijan", @@ -143,7 +143,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Baku", currency: ["AZN"], - languages: ["az"] + languages: ["az"], }, BA: { name: "Bosnia and Herzegovina", @@ -152,7 +152,7 @@ export const countries = { continent: "EU", capital: "Sarajevo", currency: ["BAM"], - languages: ["bs", "hr", "sr"] + languages: ["bs", "hr", "sr"], }, BB: { name: "Barbados", @@ -161,7 +161,7 @@ export const countries = { continent: "NA", capital: "Bridgetown", currency: ["BBD"], - languages: ["en"] + languages: ["en"], }, BD: { name: "Bangladesh", @@ -170,7 +170,7 @@ export const countries = { continent: "AS", capital: "Dhaka", currency: ["BDT"], - languages: ["bn"] + languages: ["bn"], }, BE: { name: "Belgium", @@ -179,7 +179,7 @@ export const countries = { continent: "EU", capital: "Brussels", currency: ["EUR"], - languages: ["nl", "fr", "de"] + languages: ["nl", "fr", "de"], }, BF: { name: "Burkina Faso", @@ -188,7 +188,7 @@ export const countries = { continent: "AF", capital: "Ouagadougou", currency: ["XOF"], - languages: ["fr", "ff"] + languages: ["fr", "ff"], }, BG: { name: "Bulgaria", @@ -197,7 +197,7 @@ export const countries = { continent: "EU", capital: "Sofia", currency: ["BGN"], - languages: ["bg"] + languages: ["bg"], }, BH: { name: "Bahrain", @@ -206,7 +206,7 @@ export const countries = { continent: "AS", capital: "Manama", currency: ["BHD"], - languages: ["ar"] + languages: ["ar"], }, BI: { name: "Burundi", @@ -215,7 +215,7 @@ export const countries = { continent: "AF", capital: "Bujumbura", currency: ["BIF"], - languages: ["fr", "rn"] + languages: ["fr", "rn"], }, BJ: { name: "Benin", @@ -224,7 +224,7 @@ export const countries = { continent: "AF", capital: "Porto-Novo", currency: ["XOF"], - languages: ["fr"] + languages: ["fr"], }, BL: { name: "Saint Barthelemy", @@ -233,7 +233,7 @@ export const countries = { continent: "NA", capital: "Gustavia", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, BM: { name: "Bermuda", @@ -242,7 +242,7 @@ export const countries = { continent: "NA", capital: "Hamilton", currency: ["BMD"], - languages: ["en"] + languages: ["en"], }, BN: { name: "Brunei", @@ -251,7 +251,7 @@ export const countries = { continent: "AS", capital: "Bandar Seri Begawan", currency: ["BND"], - languages: ["ms"] + languages: ["ms"], }, BO: { name: "Bolivia", @@ -260,7 +260,7 @@ export const countries = { continent: "SA", capital: "Sucre", currency: ["BOB", "BOV"], - languages: ["es", "ay", "qu"] + languages: ["es", "ay", "qu"], }, BQ: { name: "Bonaire", @@ -269,7 +269,7 @@ export const countries = { continent: "NA", capital: "Kralendijk", currency: ["USD"], - languages: ["nl"] + languages: ["nl"], }, BR: { name: "Brazil", @@ -278,7 +278,7 @@ export const countries = { continent: "SA", capital: "Brasília", currency: ["BRL"], - languages: ["pt"] + languages: ["pt"], }, BS: { name: "Bahamas", @@ -287,7 +287,7 @@ export const countries = { continent: "NA", capital: "Nassau", currency: ["BSD"], - languages: ["en"] + languages: ["en"], }, BT: { name: "Bhutan", @@ -296,7 +296,7 @@ export const countries = { continent: "AS", capital: "Thimphu", currency: ["BTN", "INR"], - languages: ["dz"] + languages: ["dz"], }, BV: { name: "Bouvet Island", @@ -305,7 +305,7 @@ export const countries = { continent: "AN", capital: "", currency: ["NOK"], - languages: ["no", "nb", "nn"] + languages: ["no", "nb", "nn"], }, BW: { name: "Botswana", @@ -314,7 +314,7 @@ export const countries = { continent: "AF", capital: "Gaborone", currency: ["BWP"], - languages: ["en", "tn"] + languages: ["en", "tn"], }, BY: { name: "Belarus", @@ -323,7 +323,7 @@ export const countries = { continent: "EU", capital: "Minsk", currency: ["BYN"], - languages: ["be", "ru"] + languages: ["be", "ru"], }, BZ: { name: "Belize", @@ -332,7 +332,7 @@ export const countries = { continent: "NA", capital: "Belmopan", currency: ["BZD"], - languages: ["en", "es"] + languages: ["en", "es"], }, CA: { name: "Canada", @@ -341,7 +341,7 @@ export const countries = { continent: "NA", capital: "Ottawa", currency: ["CAD"], - languages: ["en", "fr"] + languages: ["en", "fr"], }, CC: { name: "Cocos (Keeling) Islands", @@ -350,7 +350,7 @@ export const countries = { continent: "AS", capital: "West Island", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, CD: { name: "Democratic Republic of the Congo", @@ -359,7 +359,7 @@ export const countries = { continent: "AF", capital: "Kinshasa", currency: ["CDF"], - languages: ["fr", "ln", "kg", "sw", "lu"] + languages: ["fr", "ln", "kg", "sw", "lu"], }, CF: { name: "Central African Republic", @@ -368,7 +368,7 @@ export const countries = { continent: "AF", capital: "Bangui", currency: ["XAF"], - languages: ["fr", "sg"] + languages: ["fr", "sg"], }, CG: { name: "Republic of the Congo", @@ -377,7 +377,7 @@ export const countries = { continent: "AF", capital: "Brazzaville", currency: ["XAF"], - languages: ["fr", "ln"] + languages: ["fr", "ln"], }, CH: { name: "Switzerland", @@ -386,7 +386,7 @@ export const countries = { continent: "EU", capital: "Bern", currency: ["CHE", "CHF", "CHW"], - languages: ["de", "fr", "it"] + languages: ["de", "fr", "it"], }, CI: { name: "Ivory Coast", @@ -395,7 +395,7 @@ export const countries = { continent: "AF", capital: "Yamoussoukro", currency: ["XOF"], - languages: ["fr"] + languages: ["fr"], }, CK: { name: "Cook Islands", @@ -404,7 +404,7 @@ export const countries = { continent: "OC", capital: "Avarua", currency: ["NZD"], - languages: ["en"] + languages: ["en"], }, CL: { name: "Chile", @@ -413,7 +413,7 @@ export const countries = { continent: "SA", capital: "Santiago", currency: ["CLF", "CLP"], - languages: ["es"] + languages: ["es"], }, CM: { name: "Cameroon", @@ -422,7 +422,7 @@ export const countries = { continent: "AF", capital: "Yaoundé", currency: ["XAF"], - languages: ["en", "fr"] + languages: ["en", "fr"], }, CN: { name: "China", @@ -431,7 +431,7 @@ export const countries = { continent: "AS", capital: "Beijing", currency: ["CNY"], - languages: ["zh"] + languages: ["zh"], }, CO: { name: "Colombia", @@ -440,7 +440,7 @@ export const countries = { continent: "SA", capital: "Bogotá", currency: ["COP"], - languages: ["es"] + languages: ["es"], }, CR: { name: "Costa Rica", @@ -449,7 +449,7 @@ export const countries = { continent: "NA", capital: "San José", currency: ["CRC"], - languages: ["es"] + languages: ["es"], }, CU: { name: "Cuba", @@ -458,7 +458,7 @@ export const countries = { continent: "NA", capital: "Havana", currency: ["CUC", "CUP"], - languages: ["es"] + languages: ["es"], }, CV: { name: "Cape Verde", @@ -467,7 +467,7 @@ export const countries = { continent: "AF", capital: "Praia", currency: ["CVE"], - languages: ["pt"] + languages: ["pt"], }, CW: { name: "Curacao", @@ -476,7 +476,7 @@ export const countries = { continent: "NA", capital: "Willemstad", currency: ["ANG"], - languages: ["nl", "pa", "en"] + languages: ["nl", "pa", "en"], }, CX: { name: "Christmas Island", @@ -485,7 +485,7 @@ export const countries = { continent: "AS", capital: "Flying Fish Cove", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, CY: { name: "Cyprus", @@ -494,7 +494,7 @@ export const countries = { continent: "EU", capital: "Nicosia", currency: ["EUR"], - languages: ["el", "tr", "hy"] + languages: ["el", "tr", "hy"], }, CZ: { name: "Czech Republic", @@ -503,7 +503,7 @@ export const countries = { continent: "EU", capital: "Prague", currency: ["CZK"], - languages: ["cs"] + languages: ["cs"], }, DE: { name: "Germany", @@ -512,7 +512,7 @@ export const countries = { continent: "EU", capital: "Berlin", currency: ["EUR"], - languages: ["de"] + languages: ["de"], }, DJ: { name: "Djibouti", @@ -521,7 +521,7 @@ export const countries = { continent: "AF", capital: "Djibouti", currency: ["DJF"], - languages: ["fr", "ar"] + languages: ["fr", "ar"], }, DK: { name: "Denmark", @@ -531,7 +531,7 @@ export const countries = { continents: ["EU", "NA"], capital: "Copenhagen", currency: ["DKK"], - languages: ["da"] + languages: ["da"], }, DM: { name: "Dominica", @@ -540,7 +540,7 @@ export const countries = { continent: "NA", capital: "Roseau", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, DO: { name: "Dominican Republic", @@ -549,7 +549,7 @@ export const countries = { continent: "NA", capital: "Santo Domingo", currency: ["DOP"], - languages: ["es"] + languages: ["es"], }, DZ: { name: "Algeria", @@ -558,7 +558,7 @@ export const countries = { continent: "AF", capital: "Algiers", currency: ["DZD"], - languages: ["ar"] + languages: ["ar"], }, EC: { name: "Ecuador", @@ -567,7 +567,7 @@ export const countries = { continent: "SA", capital: "Quito", currency: ["USD"], - languages: ["es"] + languages: ["es"], }, EE: { name: "Estonia", @@ -576,7 +576,7 @@ export const countries = { continent: "EU", capital: "Tallinn", currency: ["EUR"], - languages: ["et"] + languages: ["et"], }, EG: { name: "Egypt", @@ -586,7 +586,7 @@ export const countries = { continents: ["AF", "AS"], capital: "Cairo", currency: ["EGP"], - languages: ["ar"] + languages: ["ar"], }, EH: { name: "Western Sahara", @@ -595,7 +595,7 @@ export const countries = { continent: "AF", capital: "El Aaiún", currency: ["MAD", "DZD", "MRU"], - languages: ["es"] + languages: ["es"], }, ER: { name: "Eritrea", @@ -604,7 +604,7 @@ export const countries = { continent: "AF", capital: "Asmara", currency: ["ERN"], - languages: ["ti", "ar", "en"] + languages: ["ti", "ar", "en"], }, ES: { name: "Spain", @@ -613,7 +613,7 @@ export const countries = { continent: "EU", capital: "Madrid", currency: ["EUR"], - languages: ["es", "eu", "ca", "gl", "oc"] + languages: ["es", "eu", "ca", "gl", "oc"], }, ET: { name: "Ethiopia", @@ -622,7 +622,7 @@ export const countries = { continent: "AF", capital: "Addis Ababa", currency: ["ETB"], - languages: ["am"] + languages: ["am"], }, FI: { name: "Finland", @@ -631,7 +631,7 @@ export const countries = { continent: "EU", capital: "Helsinki", currency: ["EUR"], - languages: ["fi", "sv"] + languages: ["fi", "sv"], }, FJ: { name: "Fiji", @@ -640,7 +640,7 @@ export const countries = { continent: "OC", capital: "Suva", currency: ["FJD"], - languages: ["en", "fj", "hi", "ur"] + languages: ["en", "fj", "hi", "ur"], }, FK: { name: "Falkland Islands", @@ -649,7 +649,7 @@ export const countries = { continent: "SA", capital: "Stanley", currency: ["FKP"], - languages: ["en"] + languages: ["en"], }, FM: { name: "Micronesia", @@ -658,7 +658,7 @@ export const countries = { continent: "OC", capital: "Palikir", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, FO: { name: "Faroe Islands", @@ -667,7 +667,7 @@ export const countries = { continent: "EU", capital: "Tórshavn", currency: ["DKK"], - languages: ["fo"] + languages: ["fo"], }, FR: { name: "France", @@ -676,7 +676,7 @@ export const countries = { continent: "EU", capital: "Paris", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, GA: { name: "Gabon", @@ -685,7 +685,7 @@ export const countries = { continent: "AF", capital: "Libreville", currency: ["XAF"], - languages: ["fr"] + languages: ["fr"], }, GB: { name: "United Kingdom", @@ -694,7 +694,7 @@ export const countries = { continent: "EU", capital: "London", currency: ["GBP"], - languages: ["en"] + languages: ["en"], }, GD: { name: "Grenada", @@ -703,7 +703,7 @@ export const countries = { continent: "NA", capital: "St. George's", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, GE: { name: "Georgia", @@ -713,7 +713,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Tbilisi", currency: ["GEL"], - languages: ["ka"] + languages: ["ka"], }, GF: { name: "French Guiana", @@ -722,7 +722,7 @@ export const countries = { continent: "SA", capital: "Cayenne", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, GG: { name: "Guernsey", @@ -731,7 +731,7 @@ export const countries = { continent: "EU", capital: "St. Peter Port", currency: ["GBP"], - languages: ["en", "fr"] + languages: ["en", "fr"], }, GH: { name: "Ghana", @@ -740,7 +740,7 @@ export const countries = { continent: "AF", capital: "Accra", currency: ["GHS"], - languages: ["en"] + languages: ["en"], }, GI: { name: "Gibraltar", @@ -749,7 +749,7 @@ export const countries = { continent: "EU", capital: "Gibraltar", currency: ["GIP"], - languages: ["en"] + languages: ["en"], }, GL: { name: "Greenland", @@ -758,7 +758,7 @@ export const countries = { continent: "NA", capital: "Nuuk", currency: ["DKK"], - languages: ["kl"] + languages: ["kl"], }, GM: { name: "Gambia", @@ -767,7 +767,7 @@ export const countries = { continent: "AF", capital: "Banjul", currency: ["GMD"], - languages: ["en"] + languages: ["en"], }, GN: { name: "Guinea", @@ -776,7 +776,7 @@ export const countries = { continent: "AF", capital: "Conakry", currency: ["GNF"], - languages: ["fr", "ff"] + languages: ["fr", "ff"], }, GP: { name: "Guadeloupe", @@ -785,7 +785,7 @@ export const countries = { continent: "NA", capital: "Basse-Terre", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, GQ: { name: "Equatorial Guinea", @@ -794,7 +794,7 @@ export const countries = { continent: "AF", capital: "Malabo", currency: ["XAF"], - languages: ["es", "fr"] + languages: ["es", "fr"], }, GR: { name: "Greece", @@ -803,7 +803,7 @@ export const countries = { continent: "EU", capital: "Athens", currency: ["EUR"], - languages: ["el"] + languages: ["el"], }, GS: { name: "South Georgia and the South Sandwich Islands", @@ -812,7 +812,7 @@ export const countries = { continent: "AN", capital: "King Edward Point", currency: ["GBP"], - languages: ["en"] + languages: ["en"], }, GT: { name: "Guatemala", @@ -821,7 +821,7 @@ export const countries = { continent: "NA", capital: "Guatemala City", currency: ["GTQ"], - languages: ["es"] + languages: ["es"], }, GU: { name: "Guam", @@ -830,7 +830,7 @@ export const countries = { continent: "OC", capital: "Hagåtña", currency: ["USD"], - languages: ["en", "ch", "es"] + languages: ["en", "ch", "es"], }, GW: { name: "Guinea-Bissau", @@ -839,7 +839,7 @@ export const countries = { continent: "AF", capital: "Bissau", currency: ["XOF"], - languages: ["pt"] + languages: ["pt"], }, GY: { name: "Guyana", @@ -848,7 +848,7 @@ export const countries = { continent: "SA", capital: "Georgetown", currency: ["GYD"], - languages: ["en"] + languages: ["en"], }, HK: { name: "Hong Kong", @@ -857,7 +857,7 @@ export const countries = { continent: "AS", capital: "City of Victoria", currency: ["HKD"], - languages: ["zh", "en"] + languages: ["zh", "en"], }, HM: { name: "Heard Island and McDonald Islands", @@ -866,7 +866,7 @@ export const countries = { continent: "AN", capital: "", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, HN: { name: "Honduras", @@ -875,7 +875,7 @@ export const countries = { continent: "NA", capital: "Tegucigalpa", currency: ["HNL"], - languages: ["es"] + languages: ["es"], }, HR: { name: "Croatia", @@ -884,7 +884,7 @@ export const countries = { continent: "EU", capital: "Zagreb", currency: ["EUR"], - languages: ["hr"] + languages: ["hr"], }, HT: { name: "Haiti", @@ -893,7 +893,7 @@ export const countries = { continent: "NA", capital: "Port-au-Prince", currency: ["HTG", "USD"], - languages: ["fr", "ht"] + languages: ["fr", "ht"], }, HU: { name: "Hungary", @@ -902,7 +902,7 @@ export const countries = { continent: "EU", capital: "Budapest", currency: ["HUF"], - languages: ["hu"] + languages: ["hu"], }, ID: { name: "Indonesia", @@ -911,7 +911,7 @@ export const countries = { continent: "AS", capital: "Jakarta", currency: ["IDR"], - languages: ["id"] + languages: ["id"], }, IE: { name: "Ireland", @@ -920,7 +920,7 @@ export const countries = { continent: "EU", capital: "Dublin", currency: ["EUR"], - languages: ["ga", "en"] + languages: ["ga", "en"], }, IL: { name: "Israel", @@ -929,7 +929,7 @@ export const countries = { continent: "AS", capital: "Jerusalem", currency: ["ILS"], - languages: ["he", "ar"] + languages: ["he", "ar"], }, IM: { name: "Isle of Man", @@ -938,7 +938,7 @@ export const countries = { continent: "EU", capital: "Douglas", currency: ["GBP"], - languages: ["en", "gv"] + languages: ["en", "gv"], }, IN: { name: "India", @@ -947,7 +947,7 @@ export const countries = { continent: "AS", capital: "New Delhi", currency: ["INR"], - languages: ["hi", "en"] + languages: ["hi", "en"], }, IO: { name: "British Indian Ocean Territory", @@ -956,7 +956,7 @@ export const countries = { continent: "AS", capital: "Diego Garcia", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, IQ: { name: "Iraq", @@ -965,7 +965,7 @@ export const countries = { continent: "AS", capital: "Baghdad", currency: ["IQD"], - languages: ["ar", "ku"] + languages: ["ar", "ku"], }, IR: { name: "Iran", @@ -974,7 +974,7 @@ export const countries = { continent: "AS", capital: "Tehran", currency: ["IRR"], - languages: ["fa"] + languages: ["fa"], }, IS: { name: "Iceland", @@ -983,7 +983,7 @@ export const countries = { continent: "EU", capital: "Reykjavik", currency: ["ISK"], - languages: ["is"] + languages: ["is"], }, IT: { name: "Italy", @@ -992,7 +992,7 @@ export const countries = { continent: "EU", capital: "Rome", currency: ["EUR"], - languages: ["it"] + languages: ["it"], }, JE: { name: "Jersey", @@ -1001,7 +1001,7 @@ export const countries = { continent: "EU", capital: "Saint Helier", currency: ["GBP"], - languages: ["en", "fr"] + languages: ["en", "fr"], }, JM: { name: "Jamaica", @@ -1010,7 +1010,7 @@ export const countries = { continent: "NA", capital: "Kingston", currency: ["JMD"], - languages: ["en"] + languages: ["en"], }, JO: { name: "Jordan", @@ -1019,7 +1019,7 @@ export const countries = { continent: "AS", capital: "Amman", currency: ["JOD"], - languages: ["ar"] + languages: ["ar"], }, JP: { name: "Japan", @@ -1028,7 +1028,7 @@ export const countries = { continent: "AS", capital: "Tokyo", currency: ["JPY"], - languages: ["ja"] + languages: ["ja"], }, KE: { name: "Kenya", @@ -1037,7 +1037,7 @@ export const countries = { continent: "AF", capital: "Nairobi", currency: ["KES"], - languages: ["en", "sw"] + languages: ["en", "sw"], }, KG: { name: "Kyrgyzstan", @@ -1046,7 +1046,7 @@ export const countries = { continent: "AS", capital: "Bishkek", currency: ["KGS"], - languages: ["ky", "ru"] + languages: ["ky", "ru"], }, KH: { name: "Cambodia", @@ -1055,7 +1055,7 @@ export const countries = { continent: "AS", capital: "Phnom Penh", currency: ["KHR"], - languages: ["km"] + languages: ["km"], }, KI: { name: "Kiribati", @@ -1064,7 +1064,7 @@ export const countries = { continent: "OC", capital: "South Tarawa", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, KM: { name: "Comoros", @@ -1073,7 +1073,7 @@ export const countries = { continent: "AF", capital: "Moroni", currency: ["KMF"], - languages: ["ar", "fr"] + languages: ["ar", "fr"], }, KN: { name: "Saint Kitts and Nevis", @@ -1082,7 +1082,7 @@ export const countries = { continent: "NA", capital: "Basseterre", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, KP: { name: "North Korea", @@ -1091,7 +1091,7 @@ export const countries = { continent: "AS", capital: "Pyongyang", currency: ["KPW"], - languages: ["ko"] + languages: ["ko"], }, KR: { name: "South Korea", @@ -1100,7 +1100,7 @@ export const countries = { continent: "AS", capital: "Seoul", currency: ["KRW"], - languages: ["ko"] + languages: ["ko"], }, KW: { name: "Kuwait", @@ -1109,7 +1109,7 @@ export const countries = { continent: "AS", capital: "Kuwait City", currency: ["KWD"], - languages: ["ar"] + languages: ["ar"], }, KY: { name: "Cayman Islands", @@ -1118,7 +1118,7 @@ export const countries = { continent: "NA", capital: "George Town", currency: ["KYD"], - languages: ["en"] + languages: ["en"], }, KZ: { name: "Kazakhstan", @@ -1128,7 +1128,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Astana", currency: ["KZT"], - languages: ["kk", "ru"] + languages: ["kk", "ru"], }, LA: { name: "Laos", @@ -1137,7 +1137,7 @@ export const countries = { continent: "AS", capital: "Vientiane", currency: ["LAK"], - languages: ["lo"] + languages: ["lo"], }, LB: { name: "Lebanon", @@ -1146,7 +1146,7 @@ export const countries = { continent: "AS", capital: "Beirut", currency: ["LBP"], - languages: ["ar", "fr"] + languages: ["ar", "fr"], }, LC: { name: "Saint Lucia", @@ -1155,7 +1155,7 @@ export const countries = { continent: "NA", capital: "Castries", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, LI: { name: "Liechtenstein", @@ -1164,7 +1164,7 @@ export const countries = { continent: "EU", capital: "Vaduz", currency: ["CHF"], - languages: ["de"] + languages: ["de"], }, LK: { name: "Sri Lanka", @@ -1173,7 +1173,7 @@ export const countries = { continent: "AS", capital: "Colombo", currency: ["LKR"], - languages: ["si", "ta"] + languages: ["si", "ta"], }, LR: { name: "Liberia", @@ -1182,7 +1182,7 @@ export const countries = { continent: "AF", capital: "Monrovia", currency: ["LRD"], - languages: ["en"] + languages: ["en"], }, LS: { name: "Lesotho", @@ -1191,7 +1191,7 @@ export const countries = { continent: "AF", capital: "Maseru", currency: ["LSL", "ZAR"], - languages: ["en", "st"] + languages: ["en", "st"], }, LT: { name: "Lithuania", @@ -1200,7 +1200,7 @@ export const countries = { continent: "EU", capital: "Vilnius", currency: ["EUR"], - languages: ["lt"] + languages: ["lt"], }, LU: { name: "Luxembourg", @@ -1209,7 +1209,7 @@ export const countries = { continent: "EU", capital: "Luxembourg", currency: ["EUR"], - languages: ["fr", "de", "lb"] + languages: ["fr", "de", "lb"], }, LV: { name: "Latvia", @@ -1218,7 +1218,7 @@ export const countries = { continent: "EU", capital: "Riga", currency: ["EUR"], - languages: ["lv"] + languages: ["lv"], }, LY: { name: "Libya", @@ -1227,7 +1227,7 @@ export const countries = { continent: "AF", capital: "Tripoli", currency: ["LYD"], - languages: ["ar"] + languages: ["ar"], }, MA: { name: "Morocco", @@ -1236,7 +1236,7 @@ export const countries = { continent: "AF", capital: "Rabat", currency: ["MAD"], - languages: ["ar"] + languages: ["ar"], }, MC: { name: "Monaco", @@ -1245,7 +1245,7 @@ export const countries = { continent: "EU", capital: "Monaco", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, MD: { name: "Moldova", @@ -1254,7 +1254,7 @@ export const countries = { continent: "EU", capital: "Chișinău", currency: ["MDL"], - languages: ["ro"] + languages: ["ro"], }, ME: { name: "Montenegro", @@ -1263,7 +1263,7 @@ export const countries = { continent: "EU", capital: "Podgorica", currency: ["EUR"], - languages: ["sr", "bs", "sq", "hr"] + languages: ["sr", "bs", "sq", "hr"], }, MF: { name: "Saint Martin", @@ -1272,7 +1272,7 @@ export const countries = { continent: "NA", capital: "Marigot", currency: ["EUR"], - languages: ["en", "fr", "nl"] + languages: ["en", "fr", "nl"], }, MG: { name: "Madagascar", @@ -1281,7 +1281,7 @@ export const countries = { continent: "AF", capital: "Antananarivo", currency: ["MGA"], - languages: ["fr", "mg"] + languages: ["fr", "mg"], }, MH: { name: "Marshall Islands", @@ -1290,7 +1290,7 @@ export const countries = { continent: "OC", capital: "Majuro", currency: ["USD"], - languages: ["en", "mh"] + languages: ["en", "mh"], }, MK: { name: "North Macedonia", @@ -1299,7 +1299,7 @@ export const countries = { continent: "EU", capital: "Skopje", currency: ["MKD"], - languages: ["mk"] + languages: ["mk"], }, ML: { name: "Mali", @@ -1308,7 +1308,7 @@ export const countries = { continent: "AF", capital: "Bamako", currency: ["XOF"], - languages: ["fr"] + languages: ["fr"], }, MM: { name: "Myanmar (Burma)", @@ -1317,7 +1317,7 @@ export const countries = { continent: "AS", capital: "Naypyidaw", currency: ["MMK"], - languages: ["my"] + languages: ["my"], }, MN: { name: "Mongolia", @@ -1326,7 +1326,7 @@ export const countries = { continent: "AS", capital: "Ulan Bator", currency: ["MNT"], - languages: ["mn"] + languages: ["mn"], }, MO: { name: "Macao", @@ -1335,7 +1335,7 @@ export const countries = { continent: "AS", capital: "", currency: ["MOP"], - languages: ["zh", "pt"] + languages: ["zh", "pt"], }, MP: { name: "Northern Mariana Islands", @@ -1344,7 +1344,7 @@ export const countries = { continent: "OC", capital: "Saipan", currency: ["USD"], - languages: ["en", "ch"] + languages: ["en", "ch"], }, MQ: { name: "Martinique", @@ -1353,7 +1353,7 @@ export const countries = { continent: "NA", capital: "Fort-de-France", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, MR: { name: "Mauritania", @@ -1362,7 +1362,7 @@ export const countries = { continent: "AF", capital: "Nouakchott", currency: ["MRU"], - languages: ["ar"] + languages: ["ar"], }, MS: { name: "Montserrat", @@ -1371,7 +1371,7 @@ export const countries = { continent: "NA", capital: "Plymouth", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, MT: { name: "Malta", @@ -1380,7 +1380,7 @@ export const countries = { continent: "EU", capital: "Valletta", currency: ["EUR"], - languages: ["mt", "en"] + languages: ["mt", "en"], }, MU: { name: "Mauritius", @@ -1389,7 +1389,7 @@ export const countries = { continent: "AF", capital: "Port Louis", currency: ["MUR"], - languages: ["en"] + languages: ["en"], }, MV: { name: "Maldives", @@ -1398,7 +1398,7 @@ export const countries = { continent: "AS", capital: "Malé", currency: ["MVR"], - languages: ["dv"] + languages: ["dv"], }, MW: { name: "Malawi", @@ -1407,7 +1407,7 @@ export const countries = { continent: "AF", capital: "Lilongwe", currency: ["MWK"], - languages: ["en", "ny"] + languages: ["en", "ny"], }, MX: { name: "Mexico", @@ -1416,7 +1416,7 @@ export const countries = { continent: "NA", capital: "Mexico City", currency: ["MXN"], - languages: ["es"] + languages: ["es"], }, MY: { name: "Malaysia", @@ -1425,7 +1425,7 @@ export const countries = { continent: "AS", capital: "Kuala Lumpur", currency: ["MYR"], - languages: ["ms"] + languages: ["ms"], }, MZ: { name: "Mozambique", @@ -1434,7 +1434,7 @@ export const countries = { continent: "AF", capital: "Maputo", currency: ["MZN"], - languages: ["pt"] + languages: ["pt"], }, NA: { name: "Namibia", @@ -1443,7 +1443,7 @@ export const countries = { continent: "AF", capital: "Windhoek", currency: ["NAD", "ZAR"], - languages: ["en", "af"] + languages: ["en", "af"], }, NC: { name: "New Caledonia", @@ -1452,7 +1452,7 @@ export const countries = { continent: "OC", capital: "Nouméa", currency: ["XPF"], - languages: ["fr"] + languages: ["fr"], }, NE: { name: "Niger", @@ -1461,7 +1461,7 @@ export const countries = { continent: "AF", capital: "Niamey", currency: ["XOF"], - languages: ["fr"] + languages: ["fr"], }, NF: { name: "Norfolk Island", @@ -1470,7 +1470,7 @@ export const countries = { continent: "OC", capital: "Kingston", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, NG: { name: "Nigeria", @@ -1479,7 +1479,7 @@ export const countries = { continent: "AF", capital: "Abuja", currency: ["NGN"], - languages: ["en"] + languages: ["en"], }, NI: { name: "Nicaragua", @@ -1488,7 +1488,7 @@ export const countries = { continent: "NA", capital: "Managua", currency: ["NIO"], - languages: ["es"] + languages: ["es"], }, NL: { name: "Netherlands", @@ -1497,7 +1497,7 @@ export const countries = { continent: "EU", capital: "Amsterdam", currency: ["EUR"], - languages: ["nl"] + languages: ["nl"], }, NO: { name: "Norway", @@ -1506,7 +1506,7 @@ export const countries = { continent: "EU", capital: "Oslo", currency: ["NOK"], - languages: ["no", "nb", "nn"] + languages: ["no", "nb", "nn"], }, NP: { name: "Nepal", @@ -1515,7 +1515,7 @@ export const countries = { continent: "AS", capital: "Kathmandu", currency: ["NPR"], - languages: ["ne"] + languages: ["ne"], }, NR: { name: "Nauru", @@ -1524,7 +1524,7 @@ export const countries = { continent: "OC", capital: "Yaren", currency: ["AUD"], - languages: ["en", "na"] + languages: ["en", "na"], }, NU: { name: "Niue", @@ -1533,7 +1533,7 @@ export const countries = { continent: "OC", capital: "Alofi", currency: ["NZD"], - languages: ["en"] + languages: ["en"], }, NZ: { name: "New Zealand", @@ -1542,7 +1542,7 @@ export const countries = { continent: "OC", capital: "Wellington", currency: ["NZD"], - languages: ["en", "mi"] + languages: ["en", "mi"], }, OM: { name: "Oman", @@ -1551,7 +1551,7 @@ export const countries = { continent: "AS", capital: "Muscat", currency: ["OMR"], - languages: ["ar"] + languages: ["ar"], }, PA: { name: "Panama", @@ -1560,7 +1560,7 @@ export const countries = { continent: "NA", capital: "Panama City", currency: ["PAB", "USD"], - languages: ["es"] + languages: ["es"], }, PE: { name: "Peru", @@ -1569,7 +1569,7 @@ export const countries = { continent: "SA", capital: "Lima", currency: ["PEN"], - languages: ["es"] + languages: ["es"], }, PF: { name: "French Polynesia", @@ -1578,7 +1578,7 @@ export const countries = { continent: "OC", capital: "Papeetē", currency: ["XPF"], - languages: ["fr"] + languages: ["fr"], }, PG: { name: "Papua New Guinea", @@ -1587,7 +1587,7 @@ export const countries = { continent: "OC", capital: "Port Moresby", currency: ["PGK"], - languages: ["en"] + languages: ["en"], }, PH: { name: "Philippines", @@ -1596,7 +1596,7 @@ export const countries = { continent: "AS", capital: "Manila", currency: ["PHP"], - languages: ["en"] + languages: ["en"], }, PK: { name: "Pakistan", @@ -1605,7 +1605,7 @@ export const countries = { continent: "AS", capital: "Islamabad", currency: ["PKR"], - languages: ["en", "ur"] + languages: ["en", "ur"], }, PL: { name: "Poland", @@ -1614,7 +1614,7 @@ export const countries = { continent: "EU", capital: "Warsaw", currency: ["PLN"], - languages: ["pl"] + languages: ["pl"], }, PM: { name: "Saint Pierre and Miquelon", @@ -1623,7 +1623,7 @@ export const countries = { continent: "NA", capital: "Saint-Pierre", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, PN: { name: "Pitcairn Islands", @@ -1632,7 +1632,7 @@ export const countries = { continent: "OC", capital: "Adamstown", currency: ["NZD"], - languages: ["en"] + languages: ["en"], }, PR: { name: "Puerto Rico", @@ -1641,7 +1641,7 @@ export const countries = { continent: "NA", capital: "San Juan", currency: ["USD"], - languages: ["es", "en"] + languages: ["es", "en"], }, PS: { name: "Palestine", @@ -1650,7 +1650,7 @@ export const countries = { continent: "AS", capital: "Ramallah", currency: ["ILS"], - languages: ["ar"] + languages: ["ar"], }, PT: { name: "Portugal", @@ -1659,7 +1659,7 @@ export const countries = { continent: "EU", capital: "Lisbon", currency: ["EUR"], - languages: ["pt"] + languages: ["pt"], }, PW: { name: "Palau", @@ -1668,7 +1668,7 @@ export const countries = { continent: "OC", capital: "Ngerulmud", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, PY: { name: "Paraguay", @@ -1677,7 +1677,7 @@ export const countries = { continent: "SA", capital: "Asunción", currency: ["PYG"], - languages: ["es", "gn"] + languages: ["es", "gn"], }, QA: { name: "Qatar", @@ -1686,7 +1686,7 @@ export const countries = { continent: "AS", capital: "Doha", currency: ["QAR"], - languages: ["ar"] + languages: ["ar"], }, RE: { name: "Reunion", @@ -1695,7 +1695,7 @@ export const countries = { continent: "AF", capital: "Saint-Denis", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, RO: { name: "Romania", @@ -1704,7 +1704,7 @@ export const countries = { continent: "EU", capital: "Bucharest", currency: ["RON"], - languages: ["ro"] + languages: ["ro"], }, RS: { name: "Serbia", @@ -1713,7 +1713,7 @@ export const countries = { continent: "EU", capital: "Belgrade", currency: ["RSD"], - languages: ["sr"] + languages: ["sr"], }, RU: { name: "Russia", @@ -1723,7 +1723,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Moscow", currency: ["RUB"], - languages: ["ru"] + languages: ["ru"], }, RW: { name: "Rwanda", @@ -1732,7 +1732,7 @@ export const countries = { continent: "AF", capital: "Kigali", currency: ["RWF"], - languages: ["rw", "en", "fr"] + languages: ["rw", "en", "fr"], }, SA: { name: "Saudi Arabia", @@ -1741,7 +1741,7 @@ export const countries = { continent: "AS", capital: "Riyadh", currency: ["SAR"], - languages: ["ar"] + languages: ["ar"], }, SB: { name: "Solomon Islands", @@ -1750,7 +1750,7 @@ export const countries = { continent: "OC", capital: "Honiara", currency: ["SBD"], - languages: ["en"] + languages: ["en"], }, SC: { name: "Seychelles", @@ -1759,7 +1759,7 @@ export const countries = { continent: "AF", capital: "Victoria", currency: ["SCR"], - languages: ["fr", "en"] + languages: ["fr", "en"], }, SD: { name: "Sudan", @@ -1768,7 +1768,7 @@ export const countries = { continent: "AF", capital: "Khartoum", currency: ["SDG"], - languages: ["ar", "en"] + languages: ["ar", "en"], }, SE: { name: "Sweden", @@ -1777,7 +1777,7 @@ export const countries = { continent: "EU", capital: "Stockholm", currency: ["SEK"], - languages: ["sv"] + languages: ["sv"], }, SG: { name: "Singapore", @@ -1786,7 +1786,7 @@ export const countries = { continent: "AS", capital: "Singapore", currency: ["SGD"], - languages: ["en", "ms", "ta", "zh"] + languages: ["en", "ms", "ta", "zh"], }, SH: { name: "Saint Helena", @@ -1795,7 +1795,7 @@ export const countries = { continent: "AF", capital: "Jamestown", currency: ["SHP"], - languages: ["en"] + languages: ["en"], }, SI: { name: "Slovenia", @@ -1804,7 +1804,7 @@ export const countries = { continent: "EU", capital: "Ljubljana", currency: ["EUR"], - languages: ["sl"] + languages: ["sl"], }, SJ: { name: "Svalbard and Jan Mayen", @@ -1813,7 +1813,7 @@ export const countries = { continent: "EU", capital: "Longyearbyen", currency: ["NOK"], - languages: ["no"] + languages: ["no"], }, SK: { name: "Slovakia", @@ -1822,7 +1822,7 @@ export const countries = { continent: "EU", capital: "Bratislava", currency: ["EUR"], - languages: ["sk"] + languages: ["sk"], }, SL: { name: "Sierra Leone", @@ -1831,7 +1831,7 @@ export const countries = { continent: "AF", capital: "Freetown", currency: ["SLL"], - languages: ["en"] + languages: ["en"], }, SM: { name: "San Marino", @@ -1840,7 +1840,7 @@ export const countries = { continent: "EU", capital: "City of San Marino", currency: ["EUR"], - languages: ["it"] + languages: ["it"], }, SN: { name: "Senegal", @@ -1849,7 +1849,7 @@ export const countries = { continent: "AF", capital: "Dakar", currency: ["XOF"], - languages: ["fr"] + languages: ["fr"], }, SO: { name: "Somalia", @@ -1858,7 +1858,7 @@ export const countries = { continent: "AF", capital: "Mogadishu", currency: ["SOS"], - languages: ["so", "ar"] + languages: ["so", "ar"], }, SR: { name: "Suriname", @@ -1867,7 +1867,7 @@ export const countries = { continent: "SA", capital: "Paramaribo", currency: ["SRD"], - languages: ["nl"] + languages: ["nl"], }, SS: { name: "South Sudan", @@ -1876,7 +1876,7 @@ export const countries = { continent: "AF", capital: "Juba", currency: ["SSP"], - languages: ["en"] + languages: ["en"], }, ST: { name: "Sao Tome and Principe", @@ -1885,7 +1885,7 @@ export const countries = { continent: "AF", capital: "São Tomé", currency: ["STN"], - languages: ["pt"] + languages: ["pt"], }, SV: { name: "El Salvador", @@ -1894,7 +1894,7 @@ export const countries = { continent: "NA", capital: "San Salvador", currency: ["SVC", "USD"], - languages: ["es"] + languages: ["es"], }, SX: { name: "Sint Maarten", @@ -1903,7 +1903,7 @@ export const countries = { continent: "NA", capital: "Philipsburg", currency: ["ANG"], - languages: ["nl", "en"] + languages: ["nl", "en"], }, SY: { name: "Syria", @@ -1912,7 +1912,7 @@ export const countries = { continent: "AS", capital: "Damascus", currency: ["SYP"], - languages: ["ar"] + languages: ["ar"], }, SZ: { name: "Eswatini", @@ -1921,7 +1921,7 @@ export const countries = { continent: "AF", capital: "Lobamba", currency: ["SZL"], - languages: ["en", "ss"] + languages: ["en", "ss"], }, TC: { name: "Turks and Caicos Islands", @@ -1930,7 +1930,7 @@ export const countries = { continent: "NA", capital: "Cockburn Town", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, TD: { name: "Chad", @@ -1939,7 +1939,7 @@ export const countries = { continent: "AF", capital: "N'Djamena", currency: ["XAF"], - languages: ["fr", "ar"] + languages: ["fr", "ar"], }, TF: { name: "French Southern Territories", @@ -1948,7 +1948,7 @@ export const countries = { continent: "AN", capital: "Port-aux-Français", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, TG: { name: "Togo", @@ -1957,7 +1957,7 @@ export const countries = { continent: "AF", capital: "Lomé", currency: ["XOF"], - languages: ["fr"] + languages: ["fr"], }, TH: { name: "Thailand", @@ -1966,7 +1966,7 @@ export const countries = { continent: "AS", capital: "Bangkok", currency: ["THB"], - languages: ["th"] + languages: ["th"], }, TJ: { name: "Tajikistan", @@ -1975,7 +1975,7 @@ export const countries = { continent: "AS", capital: "Dushanbe", currency: ["TJS"], - languages: ["tg", "ru"] + languages: ["tg", "ru"], }, TK: { name: "Tokelau", @@ -1984,7 +1984,7 @@ export const countries = { continent: "OC", capital: "Fakaofo", currency: ["NZD"], - languages: ["en"] + languages: ["en"], }, TL: { name: "East Timor", @@ -1993,7 +1993,7 @@ export const countries = { continent: "OC", capital: "Dili", currency: ["USD"], - languages: ["pt"] + languages: ["pt"], }, TM: { name: "Turkmenistan", @@ -2002,7 +2002,7 @@ export const countries = { continent: "AS", capital: "Ashgabat", currency: ["TMT"], - languages: ["tk", "ru"] + languages: ["tk", "ru"], }, TN: { name: "Tunisia", @@ -2011,7 +2011,7 @@ export const countries = { continent: "AF", capital: "Tunis", currency: ["TND"], - languages: ["ar"] + languages: ["ar"], }, TO: { name: "Tonga", @@ -2020,7 +2020,7 @@ export const countries = { continent: "OC", capital: "Nuku'alofa", currency: ["TOP"], - languages: ["en", "to"] + languages: ["en", "to"], }, TR: { name: "Turkey", @@ -2030,7 +2030,7 @@ export const countries = { continents: ["AS", "EU"], capital: "Ankara", currency: ["TRY"], - languages: ["tr"] + languages: ["tr"], }, TT: { name: "Trinidad and Tobago", @@ -2039,7 +2039,7 @@ export const countries = { continent: "NA", capital: "Port of Spain", currency: ["TTD"], - languages: ["en"] + languages: ["en"], }, TV: { name: "Tuvalu", @@ -2048,7 +2048,7 @@ export const countries = { continent: "OC", capital: "Funafuti", currency: ["AUD"], - languages: ["en"] + languages: ["en"], }, TW: { name: "Taiwan", @@ -2057,7 +2057,7 @@ export const countries = { continent: "AS", capital: "Taipei", currency: ["TWD"], - languages: ["zh"] + languages: ["zh"], }, TZ: { name: "Tanzania", @@ -2066,7 +2066,7 @@ export const countries = { continent: "AF", capital: "Dodoma", currency: ["TZS"], - languages: ["sw", "en"] + languages: ["sw", "en"], }, UA: { name: "Ukraine", @@ -2075,7 +2075,7 @@ export const countries = { continent: "EU", capital: "Kyiv", currency: ["UAH"], - languages: ["uk"] + languages: ["uk"], }, UG: { name: "Uganda", @@ -2084,7 +2084,7 @@ export const countries = { continent: "AF", capital: "Kampala", currency: ["UGX"], - languages: ["en", "sw"] + languages: ["en", "sw"], }, UM: { name: "U.S. Minor Outlying Islands", @@ -2093,7 +2093,7 @@ export const countries = { continent: "OC", capital: "", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, US: { name: "United States", @@ -2102,7 +2102,7 @@ export const countries = { continent: "NA", capital: "Washington D.C.", currency: ["USD", "USN", "USS"], - languages: ["en"] + languages: ["en"], }, UY: { name: "Uruguay", @@ -2111,7 +2111,7 @@ export const countries = { continent: "SA", capital: "Montevideo", currency: ["UYI", "UYU"], - languages: ["es"] + languages: ["es"], }, UZ: { name: "Uzbekistan", @@ -2120,7 +2120,7 @@ export const countries = { continent: "AS", capital: "Tashkent", currency: ["UZS"], - languages: ["uz", "ru"] + languages: ["uz", "ru"], }, VA: { name: "Vatican City", @@ -2129,7 +2129,7 @@ export const countries = { continent: "EU", capital: "Vatican City", currency: ["EUR"], - languages: ["it", "la"] + languages: ["it", "la"], }, VC: { name: "Saint Vincent and the Grenadines", @@ -2138,7 +2138,7 @@ export const countries = { continent: "NA", capital: "Kingstown", currency: ["XCD"], - languages: ["en"] + languages: ["en"], }, VE: { name: "Venezuela", @@ -2147,7 +2147,7 @@ export const countries = { continent: "SA", capital: "Caracas", currency: ["VES"], - languages: ["es"] + languages: ["es"], }, VG: { name: "British Virgin Islands", @@ -2156,7 +2156,7 @@ export const countries = { continent: "NA", capital: "Road Town", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, VI: { name: "U.S. Virgin Islands", @@ -2165,7 +2165,7 @@ export const countries = { continent: "NA", capital: "Charlotte Amalie", currency: ["USD"], - languages: ["en"] + languages: ["en"], }, VN: { name: "Vietnam", @@ -2174,7 +2174,7 @@ export const countries = { continent: "AS", capital: "Hanoi", currency: ["VND"], - languages: ["vi"] + languages: ["vi"], }, VU: { name: "Vanuatu", @@ -2183,7 +2183,7 @@ export const countries = { continent: "OC", capital: "Port Vila", currency: ["VUV"], - languages: ["bi", "en", "fr"] + languages: ["bi", "en", "fr"], }, WF: { name: "Wallis and Futuna", @@ -2192,7 +2192,7 @@ export const countries = { continent: "OC", capital: "Mata-Utu", currency: ["XPF"], - languages: ["fr"] + languages: ["fr"], }, WS: { name: "Samoa", @@ -2201,7 +2201,7 @@ export const countries = { continent: "OC", capital: "Apia", currency: ["WST"], - languages: ["sm", "en"] + languages: ["sm", "en"], }, XK: { name: "Kosovo", @@ -2211,7 +2211,7 @@ export const countries = { capital: "Pristina", currency: ["EUR"], languages: ["sq", "sr"], - userAssigned: true + userAssigned: true, }, YE: { name: "Yemen", @@ -2220,7 +2220,7 @@ export const countries = { continent: "AS", capital: "Sana'a", currency: ["YER"], - languages: ["ar"] + languages: ["ar"], }, YT: { name: "Mayotte", @@ -2229,7 +2229,7 @@ export const countries = { continent: "AF", capital: "Mamoudzou", currency: ["EUR"], - languages: ["fr"] + languages: ["fr"], }, ZA: { name: "South Africa", @@ -2238,7 +2238,7 @@ export const countries = { continent: "AF", capital: "Pretoria", currency: ["ZAR"], - languages: ["af", "en", "nr", "st", "ss", "tn", "ts", "ve", "xh", "zu"] + languages: ["af", "en", "nr", "st", "ss", "tn", "ts", "ve", "xh", "zu"], }, ZM: { name: "Zambia", @@ -2247,7 +2247,7 @@ export const countries = { continent: "AF", capital: "Lusaka", currency: ["ZMW"], - languages: ["en"] + languages: ["en"], }, ZW: { name: "Zimbabwe", @@ -2256,6 +2256,6 @@ export const countries = { continent: "AF", capital: "Harare", currency: ["USD", "ZAR", "BWP", "GBP", "AUD", "CNY", "INR", "JPY"], - languages: ["en", "sn", "nd"] - } + languages: ["en", "sn", "nd"], + }, }; diff --git a/apps/api/src/lib/validateUrl.test.ts b/apps/api/src/lib/validateUrl.test.ts index 81c150fb..e417b444 100644 --- a/apps/api/src/lib/validateUrl.test.ts +++ b/apps/api/src/lib/validateUrl.test.ts @@ -20,7 +20,7 @@ describe("isSameDomain", () => { it("should return true for a subdomain with different protocols", () => { const result = isSameDomain( "https://sub.example.com", - "http://example.com" + "http://example.com", ); expect(result).toBe(true); }); @@ -35,7 +35,7 @@ describe("isSameDomain", () => { it("should return true for a subdomain with www prefix", () => { const result = isSameDomain( "http://www.sub.example.com", - "http://example.com" + "http://example.com", ); expect(result).toBe(true); }); @@ -43,7 +43,7 @@ describe("isSameDomain", () => { it("should return true for the same domain with www prefix", () => { const result = isSameDomain( "http://docs.s.s.example.com", - "http://example.com" + "http://example.com", ); expect(result).toBe(true); }); @@ -53,7 +53,7 @@ describe("isSameSubdomain", () => { it("should return false for a subdomain", () => { const result = isSameSubdomain( "http://example.com", - "http://docs.example.com" + "http://docs.example.com", ); expect(result).toBe(false); }); @@ -61,7 +61,7 @@ describe("isSameSubdomain", () => { it("should return true for the same subdomain", () => { const result = isSameSubdomain( "http://docs.example.com", - "http://docs.example.com" + "http://docs.example.com", ); expect(result).toBe(true); }); @@ -69,7 +69,7 @@ describe("isSameSubdomain", () => { it("should return false for different subdomains", () => { const result = isSameSubdomain( "http://docs.example.com", - "http://blog.example.com" + "http://blog.example.com", ); expect(result).toBe(false); }); @@ -89,7 +89,7 @@ describe("isSameSubdomain", () => { it("should return true for the same subdomain with different protocols", () => { const result = isSameSubdomain( "https://docs.example.com", - "http://docs.example.com" + "http://docs.example.com", ); expect(result).toBe(true); }); @@ -97,7 +97,7 @@ describe("isSameSubdomain", () => { it("should return true for the same subdomain with www prefix", () => { const result = isSameSubdomain( "http://www.docs.example.com", - "http://docs.example.com" + "http://docs.example.com", ); expect(result).toBe(true); }); @@ -105,7 +105,7 @@ describe("isSameSubdomain", () => { it("should return false for a subdomain with www prefix and different subdomain", () => { const result = isSameSubdomain( "http://www.docs.example.com", - "http://blog.example.com" + "http://blog.example.com", ); expect(result).toBe(false); }); @@ -117,7 +117,7 @@ describe("removeDuplicateUrls", () => { "http://example.com", "https://example.com", "http://www.example.com", - "https://www.example.com" + "https://www.example.com", ]; const result = removeDuplicateUrls(urls); expect(result).toEqual(["https://example.com"]); @@ -128,14 +128,14 @@ describe("removeDuplicateUrls", () => { "https://example.com/page1", "https://example.com/page2", "https://example.com/page1?param=1", - "https://example.com/page1#section1" + "https://example.com/page1#section1", ]; const result = removeDuplicateUrls(urls); expect(result).toEqual([ "https://example.com/page1", "https://example.com/page2", "https://example.com/page1?param=1", - "https://example.com/page1#section1" + "https://example.com/page1#section1", ]); }); diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index ab3f4d4b..a585fe0a 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -8,7 +8,7 @@ let warningCount = 0; export function withAuth( originalFunction: (...args: U) => Promise, - mockSuccess: T + mockSuccess: T, ) { return async function (...args: U): Promise { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 981189ab..dc907371 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -2,7 +2,7 @@ import { Job } from "bullmq"; import { WebScraperOptions, RunWebScraperParams, - RunWebScraperResult + RunWebScraperResult, } from "../types"; import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../controllers/v1/types"; @@ -13,14 +13,14 @@ import { configDotenv } from "dotenv"; import { EngineResultsTracker, scrapeURL, - ScrapeUrlResponse + ScrapeUrlResponse, } from "../scraper/scrapeURL"; import { Engine } from "../scraper/scrapeURL/engines"; configDotenv(); export async function startWebScraperPipeline({ job, - token + token, }: { job: Job & { id: string }; token: string; @@ -32,9 +32,9 @@ export async function startWebScraperPipeline({ ...job.data.scrapeOptions, ...(job.data.crawl_id ? { - formats: job.data.scrapeOptions.formats.concat(["rawHtml"]) + formats: job.data.scrapeOptions.formats.concat(["rawHtml"]), } - : {}) + : {}), }, internalOptions: job.data.internalOptions, // onSuccess: (result, mode) => { @@ -48,7 +48,7 @@ export async function startWebScraperPipeline({ team_id: job.data.team_id, bull_job_id: job.id.toString(), priority: job.opts.priority, - is_scrape: job.data.is_scrape ?? false + is_scrape: job.data.is_scrape ?? false, }); } @@ -62,14 +62,14 @@ export async function runWebScraper({ team_id, bull_job_id, priority, - is_scrape = false + is_scrape = false, }: RunWebScraperParams): Promise { let response: ScrapeUrlResponse | undefined = undefined; let engines: EngineResultsTracker = {}; try { response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, - ...internalOptions + ...internalOptions, }); if (!response.success) { if (response.error instanceof Error) { @@ -81,7 +81,7 @@ export async function runWebScraper({ ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) - : response.error) + : response.error), ); } } @@ -94,7 +94,7 @@ export async function runWebScraper({ billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}` + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, ); // Optionally, you could notify an admin or add to a retry queue here }); @@ -117,14 +117,14 @@ export async function runWebScraper({ return { ...response, success: false, - error + error, }; } else { return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], - engines + engines, }; } // onError(error); @@ -154,8 +154,8 @@ export async function runWebScraper({ : result.state === "timeout" ? "Timed out" : undefined, - time_taken: result.finishedAt - result.startedAt - } + time_taken: result.finishedAt - result.startedAt, + }, }); } } @@ -166,7 +166,7 @@ const saveJob = async ( result: any, token: string, mode: string, - engines?: EngineResultsTracker + engines?: EngineResultsTracker, ) => { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index 861ae9fc..ec9967b8 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -4,7 +4,7 @@ import { autoscalerController, checkQueuesController, cleanBefore24hCompleteJobsController, - queuesController + queuesController, } from "../controllers/v0/admin/queue"; import { wrap } from "./v1"; import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; @@ -13,27 +13,27 @@ export const adminRouter = express.Router(); adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, - redisHealthController + redisHealthController, ); adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, - cleanBefore24hCompleteJobsController + cleanBefore24hCompleteJobsController, ); adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, - checkQueuesController + checkQueuesController, ); adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController); adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`, - autoscalerController + autoscalerController, ); adminRouter.post( `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`, - wrap(acucCacheClearController) + wrap(acucCacheClearController), ); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index a9727e00..5daa077b 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -8,7 +8,7 @@ import { ErrorResponse, RequestWithACUC, RequestWithAuth, - RequestWithMaybeAuth + RequestWithMaybeAuth, } from "../controllers/v1/types"; import { RateLimiterMode } from "../types"; import { authenticateUser } from "../controllers/auth"; @@ -33,7 +33,7 @@ import { extractController } from "../controllers/v1/extract"; // import { readinessController } from "../controllers/v1/readiness"; function checkCreditsMiddleware( - minimum?: number + minimum?: number, ): (req: RequestWithAuth, res: Response, next: NextFunction) => void { return (req, res, next) => { (async () => { @@ -44,20 +44,20 @@ function checkCreditsMiddleware( const { success, remainingCredits, chunk } = await checkTeamCredits( req.acuc, req.auth.team_id, - minimum ?? 1 + minimum ?? 1, ); if (chunk) { req.acuc = chunk; } if (!success) { logger.error( - `Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}` + `Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`, ); if (!res.headersSent) { return res.status(402).json({ success: false, error: - "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." + "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value.", }); } } @@ -68,7 +68,7 @@ function checkCreditsMiddleware( } export function authMiddleware( - rateLimiterMode: RateLimiterMode + rateLimiterMode: RateLimiterMode, ): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { return (req, res, next) => { (async () => { @@ -99,7 +99,7 @@ export function authMiddleware( function idempotencyMiddleware( req: Request, res: Response, - next: NextFunction + next: NextFunction, ) { (async () => { if (req.headers["x-idempotency-key"]) { @@ -123,7 +123,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { return res.status(403).json({ success: false, error: - "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." + "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions.", }); } } @@ -131,7 +131,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { } export function wrap( - controller: (req: Request, res: Response) => Promise + controller: (req: Request, res: Response) => Promise, ): (req: Request, res: Response, next: NextFunction) => any { return (req, res, next) => { controller(req, res).catch((err) => next(err)); @@ -147,7 +147,7 @@ v1Router.post( authMiddleware(RateLimiterMode.Scrape), checkCreditsMiddleware(1), blocklistMiddleware, - wrap(scrapeController) + wrap(scrapeController), ); v1Router.post( @@ -156,7 +156,7 @@ v1Router.post( checkCreditsMiddleware(), blocklistMiddleware, idempotencyMiddleware, - wrap(crawlController) + wrap(crawlController), ); v1Router.post( @@ -165,7 +165,7 @@ v1Router.post( checkCreditsMiddleware(), blocklistMiddleware, idempotencyMiddleware, - wrap(batchScrapeController) + wrap(batchScrapeController), ); v1Router.post( @@ -173,20 +173,20 @@ v1Router.post( authMiddleware(RateLimiterMode.Map), checkCreditsMiddleware(1), blocklistMiddleware, - wrap(mapController) + wrap(mapController), ); v1Router.get( "/crawl/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), - wrap(crawlStatusController) + wrap(crawlStatusController), ); v1Router.get( "/batch/scrape/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), // Yes, it uses the same controller as the normal crawl status controller - wrap((req: any, res): any => crawlStatusController(req, res, true)) + wrap((req: any, res): any => crawlStatusController(req, res, true)), ); v1Router.get("/scrape/:jobId", wrap(scrapeStatusController)); @@ -194,7 +194,7 @@ v1Router.get("/scrape/:jobId", wrap(scrapeStatusController)); v1Router.get( "/concurrency-check", authMiddleware(RateLimiterMode.CrawlStatus), - wrap(concurrencyCheckController) + wrap(concurrencyCheckController), ); v1Router.ws("/crawl/:jobId", crawlStatusWSController); @@ -203,7 +203,7 @@ v1Router.post( "/extract", authMiddleware(RateLimiterMode.Scrape), checkCreditsMiddleware(1), - wrap(extractController) + wrap(extractController), ); // v1Router.post("/crawlWebsitePreview", crawlPreviewController); @@ -211,7 +211,7 @@ v1Router.post( v1Router.delete( "/crawl/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), - crawlCancelController + crawlCancelController, ); // v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController); diff --git a/apps/api/src/run-req.ts b/apps/api/src/run-req.ts index 61ee61bd..a7f4694a 100644 --- a/apps/api/src/run-req.ts +++ b/apps/api/src/run-req.ts @@ -18,20 +18,20 @@ async function sendCrawl(result: Result): Promise { { url: url, crawlerOptions: { - limit: 75 + limit: 75, }, pageOptions: { includeHtml: true, replaceAllPathsWithAbsolutePaths: true, - waitFor: 1000 - } + waitFor: 1000, + }, }, { headers: { "Content-Type": "application/json", - Authorization: `Bearer ` - } - } + Authorization: `Bearer `, + }, + }, ); result.idempotency_key = idempotencyKey; return response.data.jobId; @@ -51,9 +51,9 @@ async function getContent(result: Result): Promise { { headers: { "Content-Type": "application/json", - Authorization: `Bearer ` - } - } + Authorization: `Bearer `, + }, + }, ); if (response.data.status === "completed") { result.result_data_jsonb = response.data.data; @@ -97,11 +97,11 @@ async function processResults(results: Result[]): Promise { // Save job id along with the start_url const resultWithJobId = results.map((r) => ({ start_url: r.start_url, - job_id: r.job_id + job_id: r.job_id, })); await fs.writeFile( "results_with_job_id_4000_6000.json", - JSON.stringify(resultWithJobId, null, 4) + JSON.stringify(resultWithJobId, null, 4), ); } catch (error) { console.error("Error writing to results_with_content.json:", error); diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index da2b7d61..897ea46c 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -32,7 +32,7 @@ describe("WebCrawler", () => { getMatchingLineNumber: jest.fn().mockReturnValue(0), getCrawlDelay: jest.fn().mockReturnValue(0), getSitemaps: jest.fn().mockReturnValue([]), - getPreferredHost: jest.fn().mockReturnValue("example.com") + getPreferredHost: jest.fn().mockReturnValue("example.com"), }); }); @@ -46,7 +46,7 @@ describe("WebCrawler", () => { includes: [], excludes: [], limit: limit, // Apply the limit - maxCrawledDepth: 10 + maxCrawledDepth: 10, }); // Mock sitemap fetching function to return more links than the limit @@ -56,7 +56,7 @@ describe("WebCrawler", () => { initialUrl, initialUrl + "/page1", initialUrl + "/page2", - initialUrl + "/page3" + initialUrl + "/page3", ]); const filteredLinks = crawler["filterLinks"]( @@ -64,10 +64,10 @@ describe("WebCrawler", () => { initialUrl, initialUrl + "/page1", initialUrl + "/page2", - initialUrl + "/page3" + initialUrl + "/page3", ], limit, - 10 + 10, ); expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index be3cdf72..19b0b5b4 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -40,7 +40,7 @@ export class WebCrawler { allowBackwardCrawling = false, allowExternalContentLinks = false, allowSubdomains = false, - ignoreRobotsTxt = false + ignoreRobotsTxt = false, }: { jobId: string; initialUrl: string; @@ -79,7 +79,7 @@ export class WebCrawler { sitemapLinks: string[], limit: number, maxDepth: number, - fromMap: boolean = false + fromMap: boolean = false, ): string[] { // If the initial URL is a sitemap.xml, skip filtering if (this.initialUrl.endsWith("sitemap.xml") && fromMap) { @@ -95,7 +95,7 @@ export class WebCrawler { this.logger.debug(`Error processing link: ${link}`, { link, error, - method: "filterLinks" + method: "filterLinks", }); return false; } @@ -112,7 +112,7 @@ export class WebCrawler { if (this.excludes.length > 0 && this.excludes[0] !== "") { if ( this.excludes.some((excludePattern) => - new RegExp(excludePattern).test(path) + new RegExp(excludePattern).test(path), ) ) { return false; @@ -123,7 +123,7 @@ export class WebCrawler { if (this.includes.length > 0 && this.includes[0] !== "") { if ( !this.includes.some((includePattern) => - new RegExp(includePattern).test(path) + new RegExp(includePattern).test(path), ) ) { return false; @@ -140,7 +140,7 @@ export class WebCrawler { } const initialHostname = normalizedInitialUrl.hostname.replace( /^www\./, - "" + "", ); const linkHostname = normalizedLink.hostname.replace(/^www\./, ""); @@ -165,7 +165,7 @@ export class WebCrawler { if (!isAllowed) { this.logger.debug(`Link disallowed by robots.txt: ${link}`, { method: "filterLinks", - link + link, }); return false; } @@ -183,12 +183,12 @@ export class WebCrawler { let extraArgs = {}; if (skipTlsVerification) { extraArgs["httpsAgent"] = new https.Agent({ - rejectUnauthorized: false + rejectUnauthorized: false, }); } const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, - ...extraArgs + ...extraArgs, }); return response.data; } @@ -199,10 +199,10 @@ export class WebCrawler { public async tryGetSitemap( fromMap: boolean = false, - onlySitemap: boolean = false + onlySitemap: boolean = false, ): Promise<{ url: string; html: string }[] | null> { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { - method: "tryGetSitemap" + method: "tryGetSitemap", }); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (fromMap && onlySitemap) { @@ -213,7 +213,7 @@ export class WebCrawler { sitemapLinks, this.limit, this.maxCrawledDepth, - fromMap + fromMap, ); return filteredLinks.map((link) => ({ url: link, html: "" })); } @@ -303,7 +303,7 @@ export class WebCrawler { private isRobotsAllowed( url: string, - ignoreRobotsTxt: boolean = false + ignoreRobotsTxt: boolean = false, ): boolean { return ignoreRobotsTxt ? true @@ -352,7 +352,7 @@ export class WebCrawler { url .split("/") .slice(3) - .filter((subArray) => subArray.length > 0).length + .filter((subArray) => subArray.length > 0).length, ); } @@ -373,7 +373,7 @@ export class WebCrawler { private isSubdomain(link: string): boolean { return new URL(link, this.baseUrl).hostname.endsWith( - "." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".") + "." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."), ); } @@ -405,7 +405,7 @@ export class WebCrawler { ".ttf", ".woff2", ".webp", - ".inc" + ".inc", ]; try { @@ -414,7 +414,7 @@ export class WebCrawler { } catch (error) { this.logger.error(`Error processing URL in isFile`, { method: "isFile", - error + error, }); return false; } @@ -431,7 +431,7 @@ export class WebCrawler { "github.com", "calendly.com", "discord.gg", - "discord.com" + "discord.com", ]; return socialMediaOrEmail.some((ext) => url.includes(ext)); } @@ -457,14 +457,14 @@ export class WebCrawler { } catch (error) { this.logger.debug( `Failed to fetch sitemap with axios from ${sitemapUrl}`, - { method: "tryFetchSitemapLinks", sitemapUrl, error } + { method: "tryFetchSitemapLinks", sitemapUrl, error }, ); if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { const response = await getLinksFromSitemap( { sitemapUrl, mode: "fire-engine" }, - this.logger + this.logger, ); if (response) { sitemapLinks = response; @@ -476,26 +476,26 @@ export class WebCrawler { const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { const response = await axios.get(baseUrlSitemap, { - timeout: axiosTimeout + timeout: axiosTimeout, }); if (response.status === 200) { sitemapLinks = await getLinksFromSitemap( { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, - this.logger + this.logger, ); } } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", sitemapUrl: baseUrlSitemap, - error + error, }); if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { sitemapLinks = await getLinksFromSitemap( { sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, - this.logger + this.logger, ); } } @@ -503,7 +503,7 @@ export class WebCrawler { const normalizedUrl = normalizeUrl(url); const normalizedSitemapLinks = sitemapLinks.map((link) => - normalizeUrl(link) + normalizeUrl(link), ); // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl if ( diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index ba77b78b..01c40de9 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -2,7 +2,7 @@ import { logger } from "../../../lib/logger"; export async function handleCustomScraping( text: string, - url: string + url: string, ): Promise<{ scraper: string; url: string; @@ -15,7 +15,7 @@ export async function handleCustomScraping( !url.includes("developers.notion.com") ) { logger.debug( - `Special use case detected for ${url}, using Fire Engine with wait time 1000ms` + `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`, ); return { scraper: "fire-engine", @@ -23,21 +23,21 @@ export async function handleCustomScraping( waitAfterLoad: 1000, pageOptions: { scrollXPaths: [ - '//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]' - ] - } + '//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]', + ], + }, }; } // Check for Vanta security portals if (text.includes(' { try { let content: string = ""; @@ -29,7 +29,7 @@ export async function getLinksFromSitemap( "sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), - { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true } + { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, ); if (!response.success) { throw response.error; @@ -41,7 +41,7 @@ export async function getLinksFromSitemap( method: "getLinksFromSitemap", mode, sitemapUrl, - error + error, }); return allUrls; @@ -56,8 +56,8 @@ export async function getLinksFromSitemap( .map((sitemap) => getLinksFromSitemap( { sitemapUrl: sitemap.loc[0], allUrls, mode }, - logger - ) + logger, + ), ); await Promise.all(sitemapPromises); } else if (root && root.url) { @@ -66,7 +66,7 @@ export async function getLinksFromSitemap( (url) => url.loc && url.loc.length > 0 && - !WebCrawler.prototype.isFile(url.loc[0]) + !WebCrawler.prototype.isFile(url.loc[0]), ) .map((url) => url.loc[0]); allUrls.push(...validUrls); @@ -76,7 +76,7 @@ export async function getLinksFromSitemap( method: "getLinksFromSitemap", mode, sitemapUrl, - error + error, }); } @@ -85,12 +85,12 @@ export async function getLinksFromSitemap( export const fetchSitemapData = async ( url: string, - timeout?: number + timeout?: number, ): Promise => { const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; try { const response = await axios.get(sitemapUrl, { - timeout: timeout || axiosTimeout + timeout: timeout || axiosTimeout, }); if (response.status === 200) { const xml = response.data; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts index d256aa44..d3963685 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts @@ -15,7 +15,7 @@ describe("Blocklist Functionality", () => { "https://flickr.com/photos/johndoe", "https://whatsapp.com/download", "https://wechat.com/features", - "https://telegram.org/apps" + "https://telegram.org/apps", ])("should return true for blocklisted URL %s", (url) => { expect(isUrlBlocked(url)).toBe(true); }); @@ -33,7 +33,7 @@ describe("Blocklist Functionality", () => { "https://flickr.com/help/terms", "https://whatsapp.com/legal", "https://wechat.com/en/privacy-policy", - "https://telegram.org/tos" + "https://telegram.org/tos", ])("should return false for allowed URLs with keywords %s", (url) => { expect(isUrlBlocked(url)).toBe(false); }); @@ -54,35 +54,35 @@ describe("Blocklist Functionality", () => { "https://facebook.com.someotherdomain.com", "https://www.facebook.com/profile", "https://api.twitter.com/info", - "https://instagram.com/accounts/login" + "https://instagram.com/accounts/login", ])( "should return true for URLs with blocklisted domains in subdomains or paths %s", (url) => { expect(isUrlBlocked(url)).toBe(true); - } + }, ); test.each([ "https://example.com/facebook.com", "https://example.com/redirect?url=https://twitter.com", - "https://facebook.com.policy.example.com" + "https://facebook.com.policy.example.com", ])( "should return false for URLs where blocklisted domain is part of another domain or path %s", (url) => { expect(isUrlBlocked(url)).toBe(false); - } + }, ); test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])( "should handle case variations %s", (url) => { expect(isUrlBlocked(url)).toBe(true); - } + }, ); test.each([ "https://facebook.com?redirect=https://example.com", - "https://twitter.com?query=something" + "https://twitter.com?query=something", ])("should handle query parameters %s", (url) => { expect(isUrlBlocked(url)).toBe(true); }); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index e60943e6..58fcade4 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -18,7 +18,7 @@ const socialMediaBlocklist = [ "youtube.com", "corterix.com", "southwest.com", - "ryanair.com" + "ryanair.com", ]; const allowedKeywords = [ @@ -41,7 +41,7 @@ const allowedKeywords = [ "://library.tiktok.com", "://ads.tiktok.com", "://tiktok.com/business", - "://developers.facebook.com" + "://developers.facebook.com", ]; export function isUrlBlocked(url: string): boolean { @@ -50,7 +50,7 @@ export function isUrlBlocked(url: string): boolean { // Check if the URL contains any allowed keywords as whole words if ( allowedKeywords.some((keyword) => - new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl) + new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl), ) ) { return false; @@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean { const isBlocked = socialMediaBlocklist.some((domain) => { const domainPattern = new RegExp( `(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`, - "i" + "i", ); return domainPattern.test(hostname); }); diff --git a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts index 3db7c5c1..a58f9c4e 100644 --- a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts +++ b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts @@ -1,6 +1,6 @@ export function getAdjustedMaxDepth( url: string, - maxCrawlDepth: number + maxCrawlDepth: number, ): number { const baseURLDepth = getURLDepth(url); const adjustedMaxDepth = maxCrawlDepth + baseURLDepth; diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts index f6ffcb13..f48806fd 100644 --- a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts @@ -14,6 +14,6 @@ export async function scrapeCache(meta: Meta): Promise { url: entry.url, html: entry.html, statusCode: entry.statusCode, - error: entry.error + error: entry.error, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index 02ed0c3f..933d4d74 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -10,6 +10,6 @@ export async function scrapeDOCX(meta: Meta): Promise { url: response.url, statusCode: response.status, - html: (await mammoth.convertToHtml({ path: tempFilePath })).value + html: (await mammoth.convertToHtml({ path: tempFilePath })).value, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index 92f2d451..af6f57c0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -4,33 +4,33 @@ import { TimeoutError } from "../../error"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; export async function scrapeURLWithFetch( - meta: Meta + meta: Meta, ): Promise { const timeout = 20000; const response = await Promise.race([ fetch(meta.url, { redirect: "follow", - headers: meta.options.headers + headers: meta.options.headers, }), (async () => { await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); throw new TimeoutError( "Fetch was unable to scrape the page before timing out", - { cause: { timeout } } + { cause: { timeout } }, ); - })() + })(), ]); specialtyScrapeCheck( meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), - Object.fromEntries(response.headers as any) + Object.fromEntries(response.headers as any), ); return { url: response.url, html: await response.text(), - statusCode: response.status + statusCode: response.status, // TODO: error? }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index c3742d26..328931ba 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -31,10 +31,10 @@ const successSchema = z.object({ actionContent: z .object({ url: z.string(), - html: z.string() + html: z.string(), }) .array() - .optional() + .optional(), }); export type FireEngineCheckStatusSuccess = z.infer; @@ -47,16 +47,16 @@ const processingSchema = z.object({ "waiting", "waiting-children", "unknown", - "prioritized" + "prioritized", ]), - processing: z.boolean() + processing: z.boolean(), }); const failedSchema = z.object({ jobId: z.string(), state: z.literal("failed"), processing: z.literal(false), - error: z.string() + error: z.string(), }); export class StillProcessingError extends Error { @@ -67,7 +67,7 @@ export class StillProcessingError extends Error { export async function fireEngineCheckStatus( logger: Logger, - jobId: string + jobId: string, ): Promise { const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; @@ -75,8 +75,8 @@ export async function fireEngineCheckStatus( { name: "fire-engine: Check status", attributes: { - jobId - } + jobId, + }, }, async (span) => { return await robustFetch({ @@ -87,12 +87,12 @@ export async function fireEngineCheckStatus( ...(Sentry.isInitialized() ? { "sentry-trace": Sentry.spanToTraceHeader(span), - baggage: Sentry.spanToBaggageHeader(span) + baggage: Sentry.spanToBaggageHeader(span), } - : {}) - } + : {}), + }, }); - } + }, ); const successParse = successSchema.safeParse(status); @@ -115,23 +115,23 @@ export async function fireEngineCheckStatus( throw new EngineError("Scrape job failed", { cause: { status, - jobId - } + jobId, + }, }); } } else { logger.debug("Check status returned response not matched by any schema", { status, - jobId + jobId, }); throw new Error( "Check status returned response not matched by any schema", { cause: { status, - jobId - } - } + jobId, + }, + }, ); } } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts index 96d73390..d5fe58cb 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts @@ -10,8 +10,8 @@ export async function fireEngineDelete(logger: Logger, jobId: string) { { name: "fire-engine: Delete scrape", attributes: { - jobId - } + jobId, + }, }, async (span) => { await robustFetch({ @@ -21,15 +21,15 @@ export async function fireEngineDelete(logger: Logger, jobId: string) { ...(Sentry.isInitialized() ? { "sentry-trace": Sentry.spanToTraceHeader(span), - baggage: Sentry.spanToBaggageHeader(span) + baggage: Sentry.spanToBaggageHeader(span), } - : {}) + : {}), }, ignoreResponse: true, ignoreFailure: true, - logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }) + logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }), }); - } + }, ); // We do not care whether this fails or not. diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 851b8faf..3fc32835 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -5,13 +5,13 @@ import { FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, - FireEngineScrapeRequestTLSClient + FireEngineScrapeRequestTLSClient, } from "./scrape"; import { EngineScrapeResult } from ".."; import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, - StillProcessingError + StillProcessingError, } from "./checkStatus"; import { EngineError, SiteError, TimeoutError } from "../../error"; import * as Sentry from "@sentry/node"; @@ -27,15 +27,15 @@ async function performFireEngineScrape< Engine extends | FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright - | FireEngineScrapeRequestTLSClient + | FireEngineScrapeRequestTLSClient, >( logger: Logger, request: FireEngineScrapeRequestCommon & Engine, - timeout = defaultTimeout + timeout = defaultTimeout, ): Promise { const scrape = await fireEngineScrape( logger.child({ method: "fireEngineScrape" }), - request + request, ); const startTime = Date.now(); @@ -47,25 +47,25 @@ async function performFireEngineScrape< if (errors.length >= errorLimit) { logger.error("Error limit hit.", { errors }); throw new Error("Error limit hit. See e.cause.errors for errors.", { - cause: { errors } + cause: { errors }, }); } if (Date.now() - startTime > timeout) { logger.info( "Fire-engine was unable to scrape the page before timing out.", - { errors, timeout } + { errors, timeout }, ); throw new TimeoutError( "Fire-engine was unable to scrape the page before timing out", - { cause: { errors, timeout } } + { cause: { errors, timeout } }, ); } try { status = await fireEngineCheckStatus( logger.child({ method: "fireEngineCheckStatus" }), - scrape.jobId + scrape.jobId, ); } catch (error) { if (error instanceof StillProcessingError) { @@ -73,7 +73,7 @@ async function performFireEngineScrape< } else if (error instanceof EngineError || error instanceof SiteError) { logger.debug("Fire-engine scrape job failed.", { error, - jobId: scrape.jobId + jobId: scrape.jobId, }); throw error; } else { @@ -81,7 +81,7 @@ async function performFireEngineScrape< errors.push(error); logger.debug( `An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, - { error, jobId: scrape.jobId } + { error, jobId: scrape.jobId }, ); } } @@ -93,7 +93,7 @@ async function performFireEngineScrape< } export async function scrapeURLWithFireEngineChromeCDP( - meta: Meta + meta: Meta, ): Promise { const actions: Action[] = [ // Transform waitFor option into an action (unsupported by chrome-cdp) @@ -101,8 +101,8 @@ export async function scrapeURLWithFireEngineChromeCDP( ? [ { type: "wait" as const, - milliseconds: meta.options.waitFor - } + milliseconds: meta.options.waitFor, + }, ] : []), @@ -112,13 +112,13 @@ export async function scrapeURLWithFireEngineChromeCDP( ? [ { type: "screenshot" as const, - fullPage: meta.options.formats.includes("screenshot@fullPage") - } + fullPage: meta.options.formats.includes("screenshot@fullPage"), + }, ] : []), // Include specified actions - ...(meta.options.actions ?? []) + ...(meta.options.actions ?? []), ]; const request: FireEngineScrapeRequestCommon & @@ -130,36 +130,36 @@ export async function scrapeURLWithFireEngineChromeCDP( headers: meta.options.headers, ...(actions.length > 0 ? { - actions + actions, } : {}), priority: meta.internalOptions.priority, geolocation: meta.options.geolocation, mobile: meta.options.mobile, timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic - disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache + disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache, // TODO: scrollXPaths }; const totalWait = actions.reduce( (a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a), - 0 + 0, ); let response = await performFireEngineScrape( meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", - request + request, }), request, - meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity // TODO: better timeout handling + meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling ); specialtyScrapeCheck( meta.logger.child({ - method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" + method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck", }), - response.responseHeaders + response.responseHeaders, ); if ( @@ -168,20 +168,20 @@ export async function scrapeURLWithFireEngineChromeCDP( ) { meta.logger.debug( "Transforming screenshots from actions into screenshot field", - { screenshots: response.screenshots } + { screenshots: response.screenshots }, ); response.screenshot = (response.screenshots ?? [])[0]; (response.screenshots ?? []).splice(0, 1); meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, - screenshot: response.screenshot + screenshot: response.screenshot, }); } if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, - sourceURL: meta.url + sourceURL: meta.url, }); } @@ -197,15 +197,15 @@ export async function scrapeURLWithFireEngineChromeCDP( ? { actions: { screenshots: response.screenshots ?? [], - scrapes: response.actionContent ?? [] - } + scrapes: response.actionContent ?? [], + }, } - : {}) + : {}), }; } export async function scrapeURLWithFireEnginePlaywright( - meta: Meta + meta: Meta, ): Promise { const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = { @@ -220,31 +220,31 @@ export async function scrapeURLWithFireEnginePlaywright( wait: meta.options.waitFor, geolocation: meta.options.geolocation, - timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic + timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic }; let response = await performFireEngineScrape( meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", - request + request, }), request, meta.options.timeout !== undefined ? defaultTimeout + meta.options.waitFor - : Infinity // TODO: better timeout handling + : Infinity, // TODO: better timeout handling ); specialtyScrapeCheck( meta.logger.child({ - method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" + method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck", }), - response.responseHeaders + response.responseHeaders, ); if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, - sourceURL: meta.url + sourceURL: meta.url, }); } @@ -257,14 +257,14 @@ export async function scrapeURLWithFireEnginePlaywright( ...(response.screenshots !== undefined && response.screenshots.length > 0 ? { - screenshot: response.screenshots[0] + screenshot: response.screenshots[0], } - : {}) + : {}), }; } export async function scrapeURLWithFireEngineTLSClient( - meta: Meta + meta: Meta, ): Promise { const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = { @@ -279,29 +279,29 @@ export async function scrapeURLWithFireEngineTLSClient( geolocation: meta.options.geolocation, disableJsDom: meta.internalOptions.v0DisableJsDom, - timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic + timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic }; let response = await performFireEngineScrape( meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", - request + request, }), request, - meta.options.timeout !== undefined ? defaultTimeout : Infinity // TODO: better timeout handling + meta.options.timeout !== undefined ? defaultTimeout : Infinity, // TODO: better timeout handling ); specialtyScrapeCheck( meta.logger.child({ - method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" + method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck", }), - response.responseHeaders + response.responseHeaders, ); if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, - sourceURL: meta.url + sourceURL: meta.url, }); } @@ -310,6 +310,6 @@ export async function scrapeURLWithFireEngineTLSClient( html: response.content, error: response.pageError, - statusCode: response.pageStatusCode + statusCode: response.pageStatusCode, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index ffca4b41..de6ac3f4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -58,17 +58,17 @@ export type FireEngineScrapeRequestTLSClient = { const schema = z.object({ jobId: z.string(), - processing: z.boolean() + processing: z.boolean(), }); export async function fireEngineScrape< Engine extends | FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright - | FireEngineScrapeRequestTLSClient + | FireEngineScrapeRequestTLSClient, >( logger: Logger, - request: FireEngineScrapeRequestCommon & Engine + request: FireEngineScrapeRequestCommon & Engine, ): Promise> { const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!; @@ -78,8 +78,8 @@ export async function fireEngineScrape< { name: "fire-engine: Scrape", attributes: { - url: request.url - } + url: request.url, + }, }, async (span) => { return await robustFetch({ @@ -89,16 +89,16 @@ export async function fireEngineScrape< ...(Sentry.isInitialized() ? { "sentry-trace": Sentry.spanToTraceHeader(span), - baggage: Sentry.spanToBaggageHeader(span) + baggage: Sentry.spanToBaggageHeader(span), } - : {}) + : {}), }, body: request, logger: logger.child({ method: "fireEngineScrape/robustFetch" }), schema, - tryCount: 3 + tryCount: 3, }); - } + }, ); return scrapeRequest; diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 1d9db249..01ac0be9 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -4,7 +4,7 @@ import { scrapeDOCX } from "./docx"; import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, - scrapeURLWithFireEngineTLSClient + scrapeURLWithFireEngineTLSClient, } from "./fire-engine"; import { scrapePDF } from "./pdf"; import { scrapeURLWithScrapingBee } from "./scrapingbee"; @@ -43,7 +43,7 @@ export const engines: Engine[] = [ ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, - "fire-engine;tlsclient" as const + "fire-engine;tlsclient" as const, ] : []), ...(useScrapingBee @@ -52,7 +52,7 @@ export const engines: Engine[] = [ ...(usePlaywright ? ["playwright" as const] : []), "fetch", "pdf", - "docx" + "docx", ]; export const featureFlags = [ @@ -66,7 +66,7 @@ export const featureFlags = [ "location", "mobile", "skipTlsVerification", - "useFastMode" + "useFastMode", ] as const; export type FeatureFlag = (typeof featureFlags)[number]; @@ -86,7 +86,7 @@ export const featureFlagOptions: { useFastMode: { priority: 90 }, location: { priority: 10 }, mobile: { priority: 10 }, - skipTlsVerification: { priority: 10 } + skipTlsVerification: { priority: 10 }, } as const; export type EngineScrapeResult = { @@ -116,7 +116,7 @@ const engineHandlers: { playwright: scrapeURLWithPlaywright, fetch: scrapeURLWithFetch, pdf: scrapePDF, - docx: scrapeDOCX + docx: scrapeDOCX, }; export const engineOptions: { @@ -141,9 +141,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: false + useFastMode: false, }, - quality: 1000 // cache should always be tried first + quality: 1000, // cache should always be tried first }, "fire-engine;chrome-cdp": { features: { @@ -157,9 +157,9 @@ export const engineOptions: { location: true, mobile: true, skipTlsVerification: true, - useFastMode: false + useFastMode: false, }, - quality: 50 + quality: 50, }, "fire-engine;playwright": { features: { @@ -173,9 +173,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: false + useFastMode: false, }, - quality: 40 + quality: 40, }, scrapingbee: { features: { @@ -189,9 +189,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: false + useFastMode: false, }, - quality: 30 + quality: 30, }, scrapingbeeLoad: { features: { @@ -205,9 +205,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: false + useFastMode: false, }, - quality: 29 + quality: 29, }, playwright: { features: { @@ -221,9 +221,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: false + useFastMode: false, }, - quality: 20 + quality: 20, }, "fire-engine;tlsclient": { features: { @@ -237,9 +237,9 @@ export const engineOptions: { location: true, mobile: false, skipTlsVerification: false, - useFastMode: true + useFastMode: true, }, - quality: 10 + quality: 10, }, fetch: { features: { @@ -253,9 +253,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: true + useFastMode: true, }, - quality: 5 + quality: 5, }, pdf: { features: { @@ -269,9 +269,9 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: true + useFastMode: true, }, - quality: -10 + quality: -10, }, docx: { features: { @@ -285,10 +285,10 @@ export const engineOptions: { location: false, mobile: false, skipTlsVerification: false, - useFastMode: true + useFastMode: true, }, - quality: -10 - } + quality: -10, + }, }; export function buildFallbackList(meta: Meta): { @@ -297,7 +297,7 @@ export function buildFallbackList(meta: Meta): { }[] { const prioritySum = [...meta.featureFlags].reduce( (a, x) => a + featureFlagOptions[x].priority, - 0 + 0, ); const priorityThreshold = Math.floor(prioritySum / 2); let selectedEngines: { @@ -315,13 +315,13 @@ export function buildFallbackList(meta: Meta): { const supportedFlags = new Set([ ...Object.entries(engineOptions[engine].features) .filter( - ([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true + ([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true, ) - .map(([k, _]) => k) + .map(([k, _]) => k), ]); const supportScore = [...supportedFlags].reduce( (a, x) => a + featureFlagOptions[x].priority, - 0 + 0, ); const unsupportedFeatures = new Set([...meta.featureFlags]); @@ -338,7 +338,7 @@ export function buildFallbackList(meta: Meta): { prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], - unsupportedFeatures + unsupportedFeatures, }); } else { meta.logger.debug( @@ -348,22 +348,22 @@ export function buildFallbackList(meta: Meta): { prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], - unsupportedFeatures - } + unsupportedFeatures, + }, ); } } if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) { selectedEngines = selectedEngines.filter( - (x) => engineOptions[x.engine].quality > 0 + (x) => engineOptions[x.engine].quality > 0, ); } selectedEngines.sort( (a, b) => b.supportScore - a.supportScore || - engineOptions[b.engine].quality - engineOptions[a.engine].quality + engineOptions[b.engine].quality - engineOptions[a.engine].quality, ); return selectedEngines; @@ -371,16 +371,16 @@ export function buildFallbackList(meta: Meta): { export async function scrapeURLWithEngine( meta: Meta, - engine: Engine + engine: Engine, ): Promise { const fn = engineHandlers[engine]; const logger = meta.logger.child({ method: fn.name ?? "scrapeURLWithEngine", - engine + engine, }); const _meta = { ...meta, - logger + logger, }; return await fn(_meta); diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 62313a71..341a4f1a 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -14,10 +14,10 @@ type PDFProcessorResult = { html: string; markdown?: string }; async function scrapePDFWithLlamaParse( meta: Meta, - tempFilePath: string + tempFilePath: string, ): Promise { meta.logger.debug("Processing PDF document with LlamaIndex", { - tempFilePath + tempFilePath, }); const uploadForm = new FormData(); @@ -28,7 +28,7 @@ async function scrapePDFWithLlamaParse( name: tempFilePath, stream() { return createReadStream( - tempFilePath + tempFilePath, ) as unknown as ReadableStream; }, arrayBuffer() { @@ -41,22 +41,22 @@ async function scrapePDFWithLlamaParse( slice(start, end, contentType) { throw Error("Unimplemented in mock Blob: slice"); }, - type: "application/pdf" + type: "application/pdf", } as Blob); const upload = await robustFetch({ url: "https://api.cloud.llamaindex.ai/api/parsing/upload", method: "POST", headers: { - Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}` + Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`, }, body: uploadForm, logger: meta.logger.child({ - method: "scrapePDFWithLlamaParse/upload/robustFetch" + method: "scrapePDFWithLlamaParse/upload/robustFetch", }), schema: z.object({ - id: z.string() - }) + id: z.string(), + }), }); const jobId = upload.id; @@ -70,18 +70,18 @@ async function scrapePDFWithLlamaParse( url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, method: "GET", headers: { - Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}` + Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`, }, logger: meta.logger.child({ - method: "scrapePDFWithLlamaParse/result/robustFetch" + method: "scrapePDFWithLlamaParse/result/robustFetch", }), schema: z.object({ - markdown: z.string() - }) + markdown: z.string(), + }), }); return { markdown: result.markdown, - html: await marked.parse(result.markdown, { async: true }) + html: await marked.parse(result.markdown, { async: true }), }; } catch (e) { if (e instanceof Error && e.message === "Request sent failure status") { @@ -93,7 +93,7 @@ async function scrapePDFWithLlamaParse( throw new RemoveFeatureError(["pdf"]); } else { throw new Error("LlamaParse threw an error", { - cause: e.cause + cause: e.cause, }); } } else { @@ -109,7 +109,7 @@ async function scrapePDFWithLlamaParse( async function scrapePDFWithParsePDF( meta: Meta, - tempFilePath: string + tempFilePath: string, ): Promise { meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); @@ -118,7 +118,7 @@ async function scrapePDFWithParsePDF( return { markdown: escaped, - html: escaped + html: escaped, }; } @@ -131,7 +131,7 @@ export async function scrapePDF(meta: Meta): Promise { statusCode: file.response.status, html: content, - markdown: content + markdown: content, }; } @@ -144,22 +144,22 @@ export async function scrapePDF(meta: Meta): Promise { { ...meta, logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithLlamaParse" - }) + method: "scrapePDF/scrapePDFWithLlamaParse", + }), }, - tempFilePath + tempFilePath, ); } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { - error + error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( "LlamaParse failed to parse PDF -- falling back to parse-pdf", - { error } + { error }, ); Sentry.captureException(error); } @@ -170,9 +170,11 @@ export async function scrapePDF(meta: Meta): Promise { result = await scrapePDFWithParsePDF( { ...meta, - logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }) + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), }, - tempFilePath + tempFilePath, ); } @@ -183,6 +185,6 @@ export async function scrapePDF(meta: Meta): Promise { statusCode: response.status, html: result.html, - markdown: result.markdown + markdown: result.markdown, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index a8c16045..c92b1d90 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -5,7 +5,7 @@ import { TimeoutError } from "../../error"; import { robustFetch } from "../../lib/fetch"; export async function scrapeURLWithPlaywright( - meta: Meta + meta: Meta, ): Promise { const timeout = 20000 + meta.options.waitFor; @@ -13,35 +13,35 @@ export async function scrapeURLWithPlaywright( await robustFetch({ url: process.env.PLAYWRIGHT_MICROSERVICE_URL!, headers: { - "Content-Type": "application/json" + "Content-Type": "application/json", }, body: { url: meta.url, wait_after_load: meta.options.waitFor, timeout, - headers: meta.options.headers + headers: meta.options.headers, }, method: "POST", logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"), schema: z.object({ content: z.string(), pageStatusCode: z.number(), - pageError: z.string().optional() - }) + pageError: z.string().optional(), + }), }), (async () => { await new Promise((resolve) => setTimeout(() => resolve(null), 20000)); throw new TimeoutError( "Playwright was unable to scrape the page before timing out", - { cause: { timeout } } + { cause: { timeout } }, ); - })() + })(), ]); return { url: meta.url, // TODO: impove redirect following html: response.content, statusCode: response.pageStatusCode, - error: response.pageError + error: response.pageError, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts index 8388016a..50ac502b 100644 --- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts @@ -8,7 +8,7 @@ import { EngineError } from "../../error"; const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!); export function scrapeURLWithScrapingBee( - wait_browser: "domcontentloaded" | "networkidle2" + wait_browser: "domcontentloaded" | "networkidle2", ): (meta: Meta) => Promise { return async (meta: Meta): Promise => { let response: AxiosResponse; @@ -23,12 +23,12 @@ export function scrapeURLWithScrapingBee( json_response: true, screenshot: meta.options.formats.includes("screenshot"), screenshot_full_page: meta.options.formats.includes( - "screenshot@fullPage" - ) + "screenshot@fullPage", + ), }, headers: { - "ScrapingService-Request": "TRUE" // this is sent to the page, not to ScrapingBee - mogery - } + "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery + }, }); } catch (error) { if (error instanceof AxiosError && error.response !== undefined) { @@ -51,25 +51,25 @@ export function scrapeURLWithScrapingBee( if (body.errors || body.body?.error || isHiddenEngineError) { meta.logger.error("ScrapingBee threw an error", { - body: body.body?.error ?? body.errors ?? body.body ?? body + body: body.body?.error ?? body.errors ?? body.body ?? body, }); throw new EngineError("Engine error #34", { - cause: { body, statusCode: response.status } + cause: { body, statusCode: response.status }, }); } if (typeof body.body !== "string") { meta.logger.error("ScrapingBee: Body is not string??", { body }); throw new EngineError("Engine error #35", { - cause: { body, statusCode: response.status } + cause: { body, statusCode: response.status }, }); } specialtyScrapeCheck( meta.logger.child({ - method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" + method: "scrapeURLWithScrapingBee/specialtyScrapeCheck", }), - body.headers + body.headers, ); return { @@ -80,9 +80,9 @@ export function scrapeURLWithScrapingBee( statusCode: response.status, ...(body.screenshot ? { - screenshot: `data:image/png;base64,${body.screenshot}` + screenshot: `data:image/png;base64,${body.screenshot}`, } - : {}) + : {}), }; }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index 84a52425..e2e3ee6f 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -13,13 +13,13 @@ export async function fetchFileToBuffer(url: string): Promise<{ const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying return { response, - buffer: Buffer.from(await response.arrayBuffer()) + buffer: Buffer.from(await response.arrayBuffer()), }; } export async function downloadFile( id: string, - url: string + url: string, ): Promise<{ response: undici.Response; tempFilePath: string; @@ -32,9 +32,9 @@ export async function downloadFile( const response = await undici.fetch(url, { dispatcher: new undici.Agent({ connect: { - rejectUnauthorized: false - } - }) + rejectUnauthorized: false, + }, + }), }); // This should never happen in the current state of JS (2024), but let's check anyways. @@ -47,13 +47,13 @@ export async function downloadFile( tempFileWrite.on("finish", () => resolve(null)); tempFileWrite.on("error", (error) => { reject( - new EngineError("Failed to write to temp file", { cause: { error } }) + new EngineError("Failed to write to temp file", { cause: { error } }), ); }); }); return { response, - tempFilePath + tempFilePath, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts index 4f497e52..352f6a7e 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts @@ -3,15 +3,15 @@ import { AddFeatureError } from "../../error"; export function specialtyScrapeCheck( logger: Logger, - headers: Record | undefined + headers: Record | undefined, ) { const contentType = (Object.entries(headers ?? {}).find( - (x) => x[0].toLowerCase() === "content-type" + (x) => x[0].toLowerCase() === "content-type", ) ?? [])[1]; if (contentType === undefined) { logger.warn("Failed to check contentType -- was not present in headers", { - headers + headers, }); } else if ( contentType === "application/pdf" || @@ -23,7 +23,7 @@ export function specialtyScrapeCheck( contentType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || contentType.startsWith( - "application/vnd.openxmlformats-officedocument.wordprocessingml.document;" + "application/vnd.openxmlformats-officedocument.wordprocessingml.document;", ) ) { // .docx diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index c6eb45e3..ec044745 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -19,7 +19,7 @@ export class NoEnginesLeftError extends Error { constructor(fallbackList: Engine[], results: EngineResultsTracker) { super( - "All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com." + "All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.", ); this.fallbackList = fallbackList; this.results = results; @@ -40,7 +40,8 @@ export class RemoveFeatureError extends Error { constructor(featureFlags: FeatureFlag[]) { super( - "Incorrect feature flags have been discovered: " + featureFlags.join(", ") + "Incorrect feature flags have been discovered: " + + featureFlags.join(", "), ); this.featureFlags = featureFlags; } @@ -50,7 +51,7 @@ export class SiteError extends Error { public code: string; constructor(code: string) { super( - "Specified URL is failing to load in the browser. Error code: " + code + "Specified URL is failing to load in the browser. Error code: " + code, ); this.code = code; } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 0a0b6c92..a3eb6f1e 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -8,7 +8,7 @@ import { Engine, EngineScrapeResult, FeatureFlag, - scrapeURLWithEngine + scrapeURLWithEngine, } from "./engines"; import { parseMarkdown } from "../../lib/html-to-markdown"; import { @@ -17,7 +17,7 @@ import { NoEnginesLeftError, RemoveFeatureError, SiteError, - TimeoutError + TimeoutError, } from "./error"; import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; @@ -50,7 +50,7 @@ export type Meta = { function buildFeatureFlags( url: string, options: ScrapeOptions, - internalOptions: InternalOptions + internalOptions: InternalOptions, ): Set { const flags: Set = new Set(); @@ -112,7 +112,7 @@ function buildMetaObject( id: string, url: string, options: ScrapeOptions, - internalOptions: InternalOptions + internalOptions: InternalOptions, ): Meta { const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")]; @@ -120,14 +120,14 @@ function buildMetaObject( options = Object.assign(options, specParams.scrapeOptions); internalOptions = Object.assign( internalOptions, - specParams.internalOptions + specParams.internalOptions, ); } const _logger = logger.child({ module: "ScrapeURL", scrapeId: id, - scrapeURL: url + scrapeURL: url, }); const logs: any[] = []; @@ -138,7 +138,7 @@ function buildMetaObject( internalOptions, logger: _logger, logs, - featureFlags: buildFeatureFlags(url, options, internalOptions) + featureFlags: buildFeatureFlags(url, options, internalOptions), }; } @@ -229,7 +229,7 @@ async function scrapeURLLoop(meta: Meta): Promise { factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, unsupportedFeatures, startedAt, - finishedAt: Date.now() + finishedAt: Date.now(), }; // NOTE: TODO: what to do when status code is bad is tough... @@ -237,35 +237,35 @@ async function scrapeURLLoop(meta: Meta): Promise { // should we just use all the fallbacks and pick the one with the longest text? - mogery if (isLongEnough || !isGoodStatusCode) { meta.logger.info("Scrape via " + engine + " deemed successful.", { - factors: { isLongEnough, isGoodStatusCode, hasNoPageError } + factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, }); result = { engine, unsupportedFeatures, - result: engineResult as EngineScrapeResult & { markdown: string } + result: engineResult as EngineScrapeResult & { markdown: string }, }; break; } } catch (error) { if (error instanceof EngineError) { meta.logger.info("Engine " + engine + " could not scrape the page.", { - error + error, }); results[engine] = { state: "error", error: safeguardCircularError(error), unexpected: false, startedAt, - finishedAt: Date.now() + finishedAt: Date.now(), }; } else if (error instanceof TimeoutError) { meta.logger.info("Engine " + engine + " timed out while scraping.", { - error + error, }); results[engine] = { state: "timeout", startedAt, - finishedAt: Date.now() + finishedAt: Date.now(), }; } else if ( error instanceof AddFeatureError || @@ -278,7 +278,7 @@ async function scrapeURLLoop(meta: Meta): Promise { error: safeguardCircularError(error), unexpected: true, startedAt, - finishedAt: Date.now() + finishedAt: Date.now(), }; error.results = results; meta.logger.warn("LLM refusal encountered", { error }); @@ -289,14 +289,14 @@ async function scrapeURLLoop(meta: Meta): Promise { Sentry.captureException(error); meta.logger.info( "An unexpected error happened while scraping with " + engine + ".", - { error } + { error }, ); results[engine] = { state: "error", error: safeguardCircularError(error), unexpected: true, startedAt, - finishedAt: Date.now() + finishedAt: Date.now(), }; } } @@ -305,7 +305,7 @@ async function scrapeURLLoop(meta: Meta): Promise { if (result === null) { throw new NoEnginesLeftError( fallbackList.map((x) => x.engine), - results + results, ); } @@ -318,15 +318,15 @@ async function scrapeURLLoop(meta: Meta): Promise { sourceURL: meta.url, url: result.result.url, statusCode: result.result.statusCode, - error: result.result.error - } + error: result.result.error, + }, }; if (result.unsupportedFeatures.size > 0) { const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`; meta.logger.warn(warning, { engine: result.engine, - unsupportedFeatures: result.unsupportedFeatures + unsupportedFeatures: result.unsupportedFeatures, }); document.warning = document.warning !== undefined @@ -340,7 +340,7 @@ async function scrapeURLLoop(meta: Meta): Promise { success: true, document, logs: meta.logs, - engines: results + engines: results, }; } @@ -348,7 +348,7 @@ export async function scrapeURL( id: string, url: string, options: ScrapeOptions, - internalOptions: InternalOptions = {} + internalOptions: InternalOptions = {}, ): Promise { const meta = buildMetaObject(id, url, options, internalOptions); try { @@ -363,10 +363,10 @@ export async function scrapeURL( meta.logger.debug( "More feature flags requested by scraper: adding " + error.featureFlags.join(", "), - { error, existingFlags: meta.featureFlags } + { error, existingFlags: meta.featureFlags }, ); meta.featureFlags = new Set( - [...meta.featureFlags].concat(error.featureFlags) + [...meta.featureFlags].concat(error.featureFlags), ); } else if ( error instanceof RemoveFeatureError && @@ -375,12 +375,12 @@ export async function scrapeURL( meta.logger.debug( "Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), - { error, existingFlags: meta.featureFlags } + { error, existingFlags: meta.featureFlags }, ); meta.featureFlags = new Set( [...meta.featureFlags].filter( - (x) => !error.featureFlags.includes(x) - ) + (x) => !error.featureFlags.includes(x), + ), ); } else { throw error; @@ -415,7 +415,7 @@ export async function scrapeURL( success: false, error, logs: meta.logs, - engines: results + engines: results, }; } } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts index 6d71c036..7d612875 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts @@ -27,7 +27,7 @@ export function extractLinks(html: string, baseUrl: string): string[] { } catch (error) { logger.error( `Failed to construct URL for href: ${href} with base: ${baseUrl}`, - { error } + { error }, ); } } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 0f581373..040bf0ee 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -4,7 +4,7 @@ import { Meta } from ".."; export function extractMetadata( meta: Meta, - html: string + html: string, ): Document["metadata"] { let title: string | undefined = undefined; let description: string | undefined = undefined; @@ -148,6 +148,6 @@ export function extractMetadata( publishedTime, articleTag, articleSection, - ...customMetadata + ...customMetadata, }; } diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 400c23a7..897587a9 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -20,7 +20,7 @@ export type RobustFetchParams> = { export async function robustFetch< Schema extends z.Schema, - Output = z.infer + Output = z.infer, >({ url, logger, @@ -32,7 +32,7 @@ export async function robustFetch< ignoreFailure = false, requestId = uuid(), tryCount = 1, - tryCooldown + tryCooldown, }: RobustFetchParams): Promise { const params = { url, @@ -44,7 +44,7 @@ export async function robustFetch< ignoreResponse, ignoreFailure, tryCount, - tryCooldown + tryCooldown, }; let request: Response; @@ -56,20 +56,20 @@ export async function robustFetch< ? {} : body !== undefined ? { - "Content-Type": "application/json" + "Content-Type": "application/json", } : {}), - ...(headers !== undefined ? headers : {}) + ...(headers !== undefined ? headers : {}), }, ...(body instanceof FormData ? { - body + body, } : body !== undefined ? { - body: JSON.stringify(body) + body: JSON.stringify(body), } - : {}) + : {}), }); } catch (error) { if (!ignoreFailure) { @@ -77,12 +77,12 @@ export async function robustFetch< if (tryCount > 1) { logger.debug( "Request failed, trying " + (tryCount - 1) + " more times", - { params, error, requestId } + { params, error, requestId }, ); return await robustFetch({ ...params, requestId, - tryCount: tryCount - 1 + tryCount: tryCount - 1, }); } else { logger.debug("Request failed", { params, error, requestId }); @@ -90,8 +90,8 @@ export async function robustFetch< cause: { params, requestId, - error - } + error, + }, }); } } else { @@ -106,39 +106,39 @@ export async function robustFetch< const response = { status: request.status, headers: request.headers, - body: await request.text() // NOTE: can this throw an exception? + body: await request.text(), // NOTE: can this throw an exception? }; if (request.status >= 300) { if (tryCount > 1) { logger.debug( "Request sent failure status, trying " + (tryCount - 1) + " more times", - { params, request, response, requestId } + { params, request, response, requestId }, ); if (tryCooldown !== undefined) { await new Promise((resolve) => - setTimeout(() => resolve(null), tryCooldown) + setTimeout(() => resolve(null), tryCooldown), ); } return await robustFetch({ ...params, requestId, - tryCount: tryCount - 1 + tryCount: tryCount - 1, }); } else { logger.debug("Request sent failure status", { params, request, response, - requestId + requestId, }); throw new Error("Request sent failure status", { cause: { params, request, response, - requestId - } + requestId, + }, }); } } @@ -151,15 +151,15 @@ export async function robustFetch< params, request, response, - requestId + requestId, }); throw new Error("Request sent malformed JSON", { cause: { params, request, response, - requestId - } + requestId, + }, }); } @@ -174,7 +174,7 @@ export async function robustFetch< response, requestId, error, - schema + schema, }); throw new Error("Response does not match provided schema", { cause: { @@ -183,8 +183,8 @@ export async function robustFetch< response, requestId, error, - schema - } + schema, + }, }); } else { logger.debug("Parsing response with provided schema failed", { @@ -193,7 +193,7 @@ export async function robustFetch< response, requestId, error, - schema + schema, }); throw new Error("Parsing response with provided schema failed", { cause: { @@ -202,8 +202,8 @@ export async function robustFetch< response, requestId, error, - schema - } + schema, + }, }); } } diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts index 7701aeaf..3afbabd5 100644 --- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts +++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts @@ -47,14 +47,14 @@ const excludeNonMainTags = [ ".widget", "#widget", ".cookie", - "#cookie" + "#cookie", ]; const forceIncludeMainTags = ["#main"]; export const removeUnwantedElements = ( html: string, - scrapeOptions: ScrapeOptions + scrapeOptions: ScrapeOptions, ) => { const soup = load(html); @@ -89,11 +89,11 @@ export const removeUnwantedElements = ( const attributes = element.attribs; const tagNameMatches = regexPattern.test(element.name); const attributesMatch = Object.keys(attributes).some((attr) => - regexPattern.test(`${attr}="${attributes[attr]}"`) + regexPattern.test(`${attr}="${attributes[attr]}"`), ); if (tag.startsWith("*.")) { classMatch = Object.keys(attributes).some((attr) => - regexPattern.test(`class="${attributes[attr]}"`) + regexPattern.test(`class="${attributes[attr]}"`), ); } return tagNameMatches || attributesMatch || classMatch; @@ -110,7 +110,7 @@ export const removeUnwantedElements = ( if (scrapeOptions.onlyMainContent) { excludeNonMainTags.forEach((tag) => { const elementsToRemove = soup(tag).filter( - forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join("") + forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join(""), ); elementsToRemove.remove(); diff --git a/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts b/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts index 0810dc93..8a3d6c3e 100644 --- a/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts +++ b/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts @@ -42,10 +42,10 @@ export const urlSpecificParams: Record = { // }, "digikey.com": { scrapeOptions: {}, - internalOptions: { forceEngine: "fire-engine;tlsclient" } + internalOptions: { forceEngine: "fire-engine;tlsclient" }, }, "lorealparis.hu": { scrapeOptions: {}, - internalOptions: { forceEngine: "fire-engine;tlsclient" } - } + internalOptions: { forceEngine: "fire-engine;tlsclient" }, + }, }; diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index 8bef0c2c..8b783821 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -13,7 +13,7 @@ const testEngines: (Engine | undefined)[] = [ "fire-engine;tlsclient", "scrapingbee", "scrapingbeeLoad", - "fetch" + "fetch", ]; const testEnginesScreenshot: (Engine | undefined)[] = [ @@ -21,7 +21,7 @@ const testEnginesScreenshot: (Engine | undefined)[] = [ "fire-engine;chrome-cdp", "fire-engine;playwright", "scrapingbee", - "scrapingbeeLoad" + "scrapingbeeLoad", ]; describe("Standalone scrapeURL tests", () => { @@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -46,26 +46,26 @@ describe("Standalone scrapeURL tests", () => { expect(out.document.metadata.error).toBeUndefined(); expect(out.document.metadata.title).toBe("Roast My Website"); expect(out.document.metadata.description).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(out.document.metadata.keywords).toBe( - "Roast My Website,Roast,Website,GitHub,Firecrawl" + "Roast My Website,Roast,Website,GitHub,Firecrawl", ); expect(out.document.metadata.robots).toBe("follow, index"); expect(out.document.metadata.ogTitle).toBe("Roast My Website"); expect(out.document.metadata.ogDescription).toBe( - "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️" + "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️", ); expect(out.document.metadata.ogUrl).toBe( - "https://www.roastmywebsite.ai" + "https://www.roastmywebsite.ai", ); expect(out.document.metadata.ogImage).toBe( - "https://www.roastmywebsite.ai/og.png" + "https://www.roastmywebsite.ai/og.png", ); expect(out.document.metadata.ogLocaleAlternate).toStrictEqual([]); expect(out.document.metadata.ogSiteName).toBe("Roast My Website"); expect(out.document.metadata.sourceURL).toBe( - "https://www.roastmywebsite.ai/" + "https://www.roastmywebsite.ai/", ); expect(out.document.metadata.statusCode).toBe(200); } @@ -76,9 +76,9 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-formats-markdown-html", "https://roastmywebsite.ai", scrapeOptions.parse({ - formats: ["markdown", "html"] + formats: ["markdown", "html"], }), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -100,9 +100,9 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-onlyMainContent-false", "https://www.scrapethissite.com/", scrapeOptions.parse({ - onlyMainContent: false + onlyMainContent: false, }), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -123,9 +123,9 @@ describe("Standalone scrapeURL tests", () => { "https://www.scrapethissite.com/", scrapeOptions.parse({ onlyMainContent: false, - excludeTags: [".nav", "#footer", "strong"] + excludeTags: [".nav", "#footer", "strong"], }), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -264,10 +264,10 @@ describe("Standalone scrapeURL tests", () => { expect(out.document.markdown).toContain("Explore Sandbox"); expect(out.document).toHaveProperty("metadata"); expect(out.document.metadata.sourceURL).toBe( - "https://scrapethissite.com/" + "https://scrapethissite.com/", ); expect(out.document.metadata.url).toBe( - "https://www.scrapethissite.com/" + "https://www.scrapethissite.com/", ); expect(out.document.metadata.statusCode).toBe(200); expect(out.document.metadata.error).toBeUndefined(); @@ -283,9 +283,9 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-screenshot", "https://www.scrapethissite.com/", scrapeOptions.parse({ - formats: ["screenshot"] + formats: ["screenshot"], }), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -296,8 +296,8 @@ describe("Standalone scrapeURL tests", () => { expect(typeof out.document.screenshot).toBe("string"); expect( out.document.screenshot!.startsWith( - "https://service.firecrawl.dev/storage/v1/object/public/media/" - ) + "https://service.firecrawl.dev/storage/v1/object/public/media/", + ), ); // TODO: attempt to fetch screenshot expect(out.document).toHaveProperty("metadata"); @@ -311,9 +311,9 @@ describe("Standalone scrapeURL tests", () => { "test:scrape-screenshot-fullPage", "https://www.scrapethissite.com/", scrapeOptions.parse({ - formats: ["screenshot@fullPage"] + formats: ["screenshot@fullPage"], }), - { forceEngine } + { forceEngine }, ); // expect(out.logs.length).toBeGreaterThan(0); @@ -324,8 +324,8 @@ describe("Standalone scrapeURL tests", () => { expect(typeof out.document.screenshot).toBe("string"); expect( out.document.screenshot!.startsWith( - "https://service.firecrawl.dev/storage/v1/object/public/media/" - ) + "https://service.firecrawl.dev/storage/v1/object/public/media/", + ), ); // TODO: attempt to fetch screenshot expect(out.document).toHaveProperty("metadata"); @@ -333,14 +333,14 @@ describe("Standalone scrapeURL tests", () => { expect(out.document.metadata.error).toBeUndefined(); } }, 30000); - } + }, ); it("Scrape of a PDF file", async () => { const out = await scrapeURL( "test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", - scrapeOptions.parse({}) + scrapeOptions.parse({}), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -358,7 +358,7 @@ describe("Standalone scrapeURL tests", () => { const out = await scrapeURL( "test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", - scrapeOptions.parse({}) + scrapeOptions.parse({}), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -367,7 +367,7 @@ describe("Standalone scrapeURL tests", () => { expect(out.document.warning).toBeUndefined(); expect(out.document).toHaveProperty("metadata"); expect(out.document.markdown).toContain( - "SERIES A PREFERRED STOCK PURCHASE AGREEMENT" + "SERIES A PREFERRED STOCK PURCHASE AGREEMENT", ); expect(out.document.metadata.statusCode).toBe(200); expect(out.document.metadata.error).toBeUndefined(); @@ -388,13 +388,13 @@ describe("Standalone scrapeURL tests", () => { properties: { company_mission: { type: "string" }, supports_sso: { type: "boolean" }, - is_open_source: { type: "boolean" } + is_open_source: { type: "boolean" }, }, required: ["company_mission", "supports_sso", "is_open_source"], - additionalProperties: false - } - } - }) + additionalProperties: false, + }, + }, + }), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -423,13 +423,13 @@ describe("Standalone scrapeURL tests", () => { properties: { company_mission: { type: "string" }, supports_sso: { type: "boolean" }, - is_open_source: { type: "boolean" } + is_open_source: { type: "boolean" }, }, required: ["company_mission", "supports_sso", "is_open_source"], - additionalProperties: false - } - } - }) + additionalProperties: false, + }, + }, + }), ); // expect(out.logs.length).toBeGreaterThan(0); @@ -460,7 +460,7 @@ describe("Standalone scrapeURL tests", () => { message: value.message, name: value.name, cause: value.cause, - stack: value.stack + stack: value.stack, }; } else { return value; @@ -486,6 +486,6 @@ describe("Standalone scrapeURL tests", () => { expect(out.document.metadata.statusCode).toBe(200); } }, - 30000 + 30000, ); }); diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts index 4a31da1f..523a8419 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts @@ -11,7 +11,7 @@ export function saveToCache(meta: Meta, document: Document): Document { if (document.rawHtml === undefined) { throw new Error( - "rawHtml is undefined -- this transformer is being called out of order" + "rawHtml is undefined -- this transformer is being called out of order", ); } @@ -22,7 +22,7 @@ export function saveToCache(meta: Meta, document: Document): Document { html: document.rawHtml!, statusCode: document.metadata.statusCode!, url: document.metadata.url ?? document.metadata.sourceURL!, - error: document.metadata.error ?? undefined + error: document.metadata.error ?? undefined, }; saveEntryToCache(key, entry); diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index 5afceda2..e14896ef 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -11,33 +11,33 @@ import { saveToCache } from "./cache"; export type Transformer = ( meta: Meta, - document: Document + document: Document, ) => Document | Promise; export function deriveMetadataFromRawHTML( meta: Meta, - document: Document + document: Document, ): Document { if (document.rawHtml === undefined) { throw new Error( - "rawHtml is undefined -- this transformer is being called out of order" + "rawHtml is undefined -- this transformer is being called out of order", ); } document.metadata = { ...extractMetadata(meta, document.rawHtml), - ...document.metadata + ...document.metadata, }; return document; } export function deriveHTMLFromRawHTML( meta: Meta, - document: Document + document: Document, ): Document { if (document.rawHtml === undefined) { throw new Error( - "rawHtml is undefined -- this transformer is being called out of order" + "rawHtml is undefined -- this transformer is being called out of order", ); } @@ -47,11 +47,11 @@ export function deriveHTMLFromRawHTML( export async function deriveMarkdownFromHTML( _meta: Meta, - document: Document + document: Document, ): Promise { if (document.html === undefined) { throw new Error( - "html is undefined -- this transformer is being called out of order" + "html is undefined -- this transformer is being called out of order", ); } @@ -64,7 +64,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document { if (meta.options.formats.includes("links")) { if (document.html === undefined) { throw new Error( - "html is undefined -- this transformer is being called out of order" + "html is undefined -- this transformer is being called out of order", ); } @@ -76,7 +76,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document { export function coerceFieldsToFormats( meta: Meta, - document: Document + document: Document, ): Document { const formats = new Set(meta.options.formats); @@ -84,7 +84,7 @@ export function coerceFieldsToFormats( delete document.markdown; } else if (formats.has("markdown") && document.markdown === undefined) { meta.logger.warn( - "Request had format: markdown, but there was no markdown field in the result." + "Request had format: markdown, but there was no markdown field in the result.", ); } @@ -92,7 +92,7 @@ export function coerceFieldsToFormats( delete document.rawHtml; } else if (formats.has("rawHtml") && document.rawHtml === undefined) { meta.logger.warn( - "Request had format: rawHtml, but there was no rawHtml field in the result." + "Request had format: rawHtml, but there was no rawHtml field in the result.", ); } @@ -100,7 +100,7 @@ export function coerceFieldsToFormats( delete document.html; } else if (formats.has("html") && document.html === undefined) { meta.logger.warn( - "Request had format: html, but there was no html field in the result." + "Request had format: html, but there was no html field in the result.", ); } @@ -110,7 +110,7 @@ export function coerceFieldsToFormats( document.screenshot !== undefined ) { meta.logger.warn( - "Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug." + "Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug.", ); delete document.screenshot; } else if ( @@ -118,29 +118,29 @@ export function coerceFieldsToFormats( document.screenshot === undefined ) { meta.logger.warn( - "Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result." + "Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result.", ); } if (!formats.has("links") && document.links !== undefined) { meta.logger.warn( - "Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug." + "Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug.", ); delete document.links; } else if (formats.has("links") && document.links === undefined) { meta.logger.warn( - "Request had format: links, but there was no links field in the result." + "Request had format: links, but there was no links field in the result.", ); } if (!formats.has("extract") && document.extract !== undefined) { meta.logger.warn( - "Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug." + "Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.", ); delete document.extract; } else if (formats.has("extract") && document.extract === undefined) { meta.logger.warn( - "Request had format: extract, but there was no extract field in the result." + "Request had format: extract, but there was no extract field in the result.", ); } @@ -161,12 +161,12 @@ export const transformerStack: Transformer[] = [ uploadScreenshot, performLLMExtract, coerceFieldsToFormats, - removeBase64Images + removeBase64Images, ]; export async function executeTransformers( meta: Meta, - document: Document + document: Document, ): Promise { const executions: [string, number][] = []; @@ -174,8 +174,8 @@ export async function executeTransformers( const _meta = { ...meta, logger: meta.logger.child({ - method: "executeTransformers/" + transformer.name - }) + method: "executeTransformers/" + transformer.name, + }), }; const start = Date.now(); document = await transformer(_meta, document); diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index f09073ee..6380edb8 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -25,8 +25,8 @@ function normalizeSchema(x: any): any { x["$defs"] = Object.fromEntries( Object.entries(x["$defs"]).map(([name, schema]) => [ name, - normalizeSchema(schema) - ]) + normalizeSchema(schema), + ]), ); } @@ -50,15 +50,15 @@ function normalizeSchema(x: any): any { return { ...x, properties: Object.fromEntries( - Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]) + Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]), ), required: Object.keys(x.properties), - additionalProperties: false + additionalProperties: false, }; } else if (x && x.type === "array") { return { ...x, - items: normalizeSchema(x.items) + items: normalizeSchema(x.items), }; } else { return x; @@ -70,7 +70,7 @@ export async function generateOpenAICompletions( options: ExtractOptions, markdown?: string, previousWarning?: string, - isExtractEndpoint?: boolean + isExtractEndpoint?: boolean, ): Promise<{ extract: any; numTokens: number; warning: string | undefined }> { let extract: any; let warning: string | undefined; @@ -125,19 +125,19 @@ export async function generateOpenAICompletions( schema = { type: "object", properties: { - items: options.schema + items: options.schema, }, required: ["items"], - additionalProperties: false + additionalProperties: false, }; } else if (schema && typeof schema === "object" && !schema.type) { schema = { type: "object", properties: Object.fromEntries( - Object.entries(schema).map(([key, value]) => [key, { type: value }]) + Object.entries(schema).map(([key, value]) => [key, { type: value }]), ), required: Object.keys(schema), - additionalProperties: false + additionalProperties: false, }; } @@ -149,19 +149,19 @@ export async function generateOpenAICompletions( messages: [ { role: "system", - content: options.systemPrompt + content: options.systemPrompt, }, { role: "user", - content: [{ type: "text", text: markdown }] + content: [{ type: "text", text: markdown }], }, { role: "user", content: options.prompt !== undefined ? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}` - : "Transform the above content into structured JSON output." - } + : "Transform the above content into structured JSON output.", + }, ], response_format: options.schema ? { @@ -169,10 +169,10 @@ export async function generateOpenAICompletions( json_schema: { name: "websiteContent", schema: schema, - strict: true - } + strict: true, + }, } - : { type: "json_object" } + : { type: "json_object" }, }); if (jsonCompletion.choices[0].message.refusal !== null) { @@ -187,16 +187,16 @@ export async function generateOpenAICompletions( extract = JSON.parse(jsonCompletion.choices[0].message.content); } else { const extractData = JSON.parse( - jsonCompletion.choices[0].message.content + jsonCompletion.choices[0].message.content, ); extract = options.schema ? extractData.data.extract : extractData; } } catch (e) { logger.error("Failed to parse returned JSON, no schema specified.", { - error: e + error: e, }); throw new LLMRefusalError( - "Failed to parse returned JSON. Please specify a schema in the extract object." + "Failed to parse returned JSON. Please specify a schema in the extract object.", ); } } @@ -215,16 +215,16 @@ export async function generateOpenAICompletions( export async function performLLMExtract( meta: Meta, - document: Document + document: Document, ): Promise { if (meta.options.formats.includes("extract")) { const { extract, warning } = await generateOpenAICompletions( meta.logger.child({ - method: "performLLMExtract/generateOpenAICompletions" + method: "performLLMExtract/generateOpenAICompletions", }), meta.options.extract!, document.markdown, - document.warning + document.warning, ); document.extract = extract; document.warning = warning; diff --git a/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts b/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts index 3bc408ff..aa4e937f 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts @@ -7,7 +7,7 @@ export function removeBase64Images(meta: Meta, document: Document): Document { if (meta.options.removeBase64Images && document.markdown !== undefined) { document.markdown = document.markdown.replace( regex, - "$1()" + "$1()", ); } return document; diff --git a/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts b/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts index ed01af69..83df17b8 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts @@ -23,8 +23,8 @@ export function uploadScreenshot(meta: Meta, document: Document): Document { { cacheControl: "3600", upsert: false, - contentType: document.screenshot.split(":")[1].split(";")[0] - } + contentType: document.screenshot.split(":")[1].split(";")[0], + }, ); document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`; diff --git a/apps/api/src/search/fireEngine.ts b/apps/api/src/search/fireEngine.ts index 3fa9c588..26277523 100644 --- a/apps/api/src/search/fireEngine.ts +++ b/apps/api/src/search/fireEngine.ts @@ -15,7 +15,7 @@ export async function fireEngineMap( location?: string; numResults: number; page?: number; - } + }, ): Promise { try { let data = JSON.stringify({ @@ -25,12 +25,12 @@ export async function fireEngineMap( location: options.location, tbs: options.tbs, numResults: options.numResults, - page: options.page ?? 1 + page: options.page ?? 1, }); if (!process.env.FIRE_ENGINE_BETA_URL) { console.warn( - "(v1/map Beta) Results might differ from cloud offering currently." + "(v1/map Beta) Results might differ from cloud offering currently.", ); return []; } @@ -39,9 +39,9 @@ export async function fireEngineMap( method: "POST", headers: { "Content-Type": "application/json", - "X-Disable-Cache": "true" + "X-Disable-Cache": "true", }, - body: data + body: data, }); if (response.ok) { diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index a7c78fc9..74620651 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -11,7 +11,7 @@ const _useragent_list = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0" + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", ]; function get_useragent(): string { @@ -27,14 +27,14 @@ async function _req( proxies: any, timeout: number, tbs: string | undefined = undefined, - filter: string | undefined = undefined + filter: string | undefined = undefined, ) { const params = { q: term, num: results, // Number of results to return hl: lang, gl: country, - start: start + start: start, }; if (tbs) { params["tbs"] = tbs; @@ -45,11 +45,11 @@ async function _req( try { const resp = await axios.get("https://www.google.com/search", { headers: { - "User-Agent": get_useragent() + "User-Agent": get_useragent(), }, params: params, proxy: proxies, - timeout: timeout + timeout: timeout, }); return resp; } catch (error) { @@ -70,7 +70,7 @@ export async function googleSearch( country = "us", proxy = undefined as string | undefined, sleep_interval = 0, - timeout = 5000 + timeout = 5000, ): Promise { let proxies: any = null; if (proxy) { @@ -98,7 +98,7 @@ export async function googleSearch( proxies, timeout, tbs, - filter + filter, ); const $ = cheerio.load(resp.data); const result_block = $("div.g"); @@ -117,7 +117,7 @@ export async function googleSearch( const title = $(element).find("h3"); const ogImage = $(element).find("img").eq(1).attr("src"); const description_box = $(element).find( - "div[style='-webkit-line-clamp:2']" + "div[style='-webkit-line-clamp:2']", ); const answerBox = $(element).find(".mod").text(); if (description_box) { @@ -129,7 +129,7 @@ export async function googleSearch( } }); await new Promise((resolve) => - setTimeout(resolve, sleep_interval * 1000) + setTimeout(resolve, sleep_interval * 1000), ); } catch (error) { if (error.message === "Too many requests") { diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 978a57e0..82a6b68f 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -16,7 +16,7 @@ export async function search({ location = undefined, proxy = undefined, sleep_interval = 0, - timeout = 5000 + timeout = 5000, }: { query: string; advanced?: boolean; @@ -38,7 +38,7 @@ export async function search({ filter, lang, country, - location + location, }); } if (process.env.SEARCHAPI_API_KEY) { @@ -48,7 +48,7 @@ export async function search({ filter, lang, country, - location + location, }); } return await googleSearch( @@ -61,7 +61,7 @@ export async function search({ country, proxy, sleep_interval, - timeout + timeout, ); } catch (error) { logger.error(`Error in search function: ${error}`); diff --git a/apps/api/src/search/searchapi.ts b/apps/api/src/search/searchapi.ts index ea21c8d3..896c64c6 100644 --- a/apps/api/src/search/searchapi.ts +++ b/apps/api/src/search/searchapi.ts @@ -16,7 +16,7 @@ interface SearchOptions { export async function searchapi_search( q: string, - options: SearchOptions + options: SearchOptions, ): Promise { const params = { q: q, @@ -25,7 +25,7 @@ export async function searchapi_search( location: options.location, num: options.num_results, page: options.page ?? 1, - engine: process.env.SEARCHAPI_ENGINE || "google" + engine: process.env.SEARCHAPI_ENGINE || "google", }; const url = `https://www.searchapi.io/api/v1/search`; @@ -35,9 +35,9 @@ export async function searchapi_search( headers: { Authorization: `Bearer ${process.env.SEARCHAPI_API_KEY}`, "Content-Type": "application/json", - "X-SearchApi-Source": "Firecrawl" + "X-SearchApi-Source": "Firecrawl", }, - params: params + params: params, }); if (response.status === 401) { @@ -50,7 +50,7 @@ export async function searchapi_search( return data.organic_results.map((a: any) => ({ url: a.link, title: a.title, - description: a.snippet + description: a.snippet, })); } else { return []; diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/serper.ts index 4abf720d..88ff7cc0 100644 --- a/apps/api/src/search/serper.ts +++ b/apps/api/src/search/serper.ts @@ -14,7 +14,7 @@ export async function serper_search( location?: string; num_results: number; page?: number; - } + }, ): Promise { let data = JSON.stringify({ q: q, @@ -23,7 +23,7 @@ export async function serper_search( location: options.location, tbs: options.tbs, num: options.num_results, - page: options.page ?? 1 + page: options.page ?? 1, }); let config = { @@ -31,16 +31,16 @@ export async function serper_search( url: "https://google.serper.dev/search", headers: { "X-API-KEY": process.env.SERPER_API_KEY, - "Content-Type": "application/json" + "Content-Type": "application/json", }, - data: data + data: data, }; const response = await axios(config); if (response && response.data && Array.isArray(response.data.organic)) { return response.data.organic.map((a) => ({ url: a.link, title: a.title, - description: a.snippet + description: a.snippet, })); } else { return []; diff --git a/apps/api/src/services/alerts/index.ts b/apps/api/src/services/alerts/index.ts index 3aaea3aa..44f2b8a0 100644 --- a/apps/api/src/services/alerts/index.ts +++ b/apps/api/src/services/alerts/index.ts @@ -17,15 +17,15 @@ export async function checkAlerts() { const activeJobs = await scrapeQueue.getActiveCount(); if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { logger.warn( - `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` + `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`, ); sendSlackWebhook( `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`, - true + true, ); } else { logger.info( - `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}` + `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`, ); } } catch (error) { @@ -39,11 +39,11 @@ export async function checkAlerts() { if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { logger.warn( - `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.` + `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`, ); sendSlackWebhook( `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}. Scale up the number of workers with fly scale count worker=20`, - true + true, ); } }; diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index 11280f28..ad8f9186 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -4,18 +4,18 @@ import { logger } from "../../../src/lib/logger"; export async function sendSlackWebhook( message: string, alertEveryone: boolean = false, - webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? "" + webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? "", ) { const messagePrefix = alertEveryone ? " " : ""; const payload = { - text: `${messagePrefix} ${message}` + text: `${messagePrefix} ${message}`, }; try { const response = await axios.post(webhookUrl, payload, { headers: { - "Content-Type": "application/json" - } + "Content-Type": "application/json", + }, }); logger.info("Webhook sent successfully:", response.data); } catch (error) { diff --git a/apps/api/src/services/billing/auto_charge.ts b/apps/api/src/services/billing/auto_charge.ts index 3411c921..45fdf1f5 100644 --- a/apps/api/src/services/billing/auto_charge.ts +++ b/apps/api/src/services/billing/auto_charge.ts @@ -22,7 +22,7 @@ const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds */ export async function autoCharge( chunk: AuthCreditUsageChunk, - autoRechargeThreshold: number + autoRechargeThreshold: number, ): Promise<{ success: boolean; message: string; @@ -38,13 +38,13 @@ export async function autoCharge( const cooldownValue = await getValue(cooldownKey); if (cooldownValue) { logger.info( - `Auto-recharge for team ${chunk.team_id} is in cooldown period` + `Auto-recharge for team ${chunk.team_id} is in cooldown period`, ); return { success: false, message: "Auto-recharge is in cooldown period", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } @@ -53,7 +53,7 @@ export async function autoCharge( [resource], 5000, async ( - signal + signal, ): Promise<{ success: boolean; message: string; @@ -81,7 +81,7 @@ export async function autoCharge( success: false, message: "Error fetching customer data", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } @@ -90,7 +90,7 @@ export async function autoCharge( // Attempt to create a payment intent const paymentStatus = await createPaymentIntent( chunk.team_id, - customer.stripe_customer_id + customer.stripe_customer_id, ); // If payment is successful or requires further action, issue credits @@ -100,7 +100,7 @@ export async function autoCharge( ) { issueCreditsSuccess = await issueCredits( chunk.team_id, - AUTO_RECHARGE_CREDITS + AUTO_RECHARGE_CREDITS, ); } @@ -109,7 +109,7 @@ export async function autoCharge( team_id: chunk.team_id, initial_payment_status: paymentStatus.return_status, credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0, - stripe_charge_id: paymentStatus.charge_id + stripe_charge_id: paymentStatus.charge_id, }); // Send a notification if credits were successfully issued @@ -120,7 +120,7 @@ export async function autoCharge( chunk.sub_current_period_start, chunk.sub_current_period_end, chunk, - true + true, ); // Set cooldown period @@ -139,7 +139,7 @@ export async function autoCharge( sendSlackWebhook( `Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`, false, - process.env.SLACK_ADMIN_WEBHOOK_URL + process.env.SLACK_ADMIN_WEBHOOK_URL, ).catch((error) => { logger.debug(`Error sending slack notification: ${error}`); }); @@ -156,8 +156,8 @@ export async function autoCharge( chunk: { ...chunk, remaining_credits: - chunk.remaining_credits + AUTO_RECHARGE_CREDITS - } + chunk.remaining_credits + AUTO_RECHARGE_CREDITS, + }, }; } else { logger.error("No Stripe customer ID found for user"); @@ -165,7 +165,7 @@ export async function autoCharge( success: false, message: "No Stripe customer ID found for user", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } } else { @@ -174,7 +174,7 @@ export async function autoCharge( success: false, message: "No sub_user_id found in chunk", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } } @@ -182,9 +182,9 @@ export async function autoCharge( success: false, message: "No need to auto-recharge", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; - } + }, ); } catch (error) { logger.error(`Failed to acquire lock for auto-recharge: ${error}`); @@ -192,7 +192,7 @@ export async function autoCharge( success: false, message: "Failed to acquire lock for auto-recharge", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index f25e165e..bbd04cc0 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -19,18 +19,18 @@ const FREE_CREDITS = 500; export async function billTeam( team_id: string, subscription_id: string | null | undefined, - credits: number + credits: number, ) { return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })( team_id, subscription_id, - credits + credits, ); } export async function supaBillTeam( team_id: string, subscription_id: string | null | undefined, - credits: number + credits: number, ) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; @@ -41,7 +41,7 @@ export async function supaBillTeam( _team_id: team_id, sub_id: subscription_id ?? null, fetch_subscription: subscription_id === undefined, - credits + credits, }); if (error) { @@ -58,9 +58,9 @@ export async function supaBillTeam( ...acuc, credits_used: acuc.credits_used + credits, adjusted_credits_used: acuc.adjusted_credits_used + credits, - remaining_credits: acuc.remaining_credits - credits + remaining_credits: acuc.remaining_credits - credits, } - : null + : null, ); } })(); @@ -76,12 +76,12 @@ export type CheckTeamCreditsResponse = { export async function checkTeamCredits( chunk: AuthCreditUsageChunk | null, team_id: string, - credits: number + credits: number, ): Promise { return withAuth(supaCheckTeamCredits, { success: true, message: "No DB, bypassed", - remainingCredits: Infinity + remainingCredits: Infinity, })(chunk, team_id, credits); } @@ -89,14 +89,14 @@ export async function checkTeamCredits( export async function supaCheckTeamCredits( chunk: AuthCreditUsageChunk | null, team_id: string, - credits: number + credits: number, ): Promise { // WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery if (team_id === "preview") { return { success: true, message: "Preview team, no credits used", - remainingCredits: Infinity + remainingCredits: Infinity, }; } else if (chunk === null) { throw new Error("NULL ACUC passed to supaCheckTeamCredits"); @@ -141,7 +141,7 @@ export async function supaCheckTeamCredits( success: true, message: autoChargeResult.message, remainingCredits: autoChargeResult.remainingCredits, - chunk: autoChargeResult.chunk + chunk: autoChargeResult.chunk, }; } } @@ -155,7 +155,7 @@ export async function supaCheckTeamCredits( NotificationType.LIMIT_REACHED, chunk.sub_current_period_start, chunk.sub_current_period_end, - chunk + chunk, ); } return { @@ -163,7 +163,7 @@ export async function supaCheckTeamCredits( message: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { // Send email notification for approaching credit limit @@ -172,7 +172,7 @@ export async function supaCheckTeamCredits( NotificationType.APPROACHING_LIMIT, chunk.sub_current_period_start, chunk.sub_current_period_end, - chunk + chunk, ); } @@ -180,13 +180,13 @@ export async function supaCheckTeamCredits( success: true, message: "Sufficient credits available", remainingCredits: chunk.remaining_credits, - chunk + chunk, }; } // Count the total credits used by a team within the current billing period and return the remaining credits. export async function countCreditsAndRemainingForCurrentBillingPeriod( - team_id: string + team_id: string, ) { // 1. Retrieve the team's active subscription based on the team_id. const { data: subscription, error: subscriptionError } = @@ -206,7 +206,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( (total, coupon) => total + coupon.credits, - 0 + 0, ); } @@ -221,20 +221,20 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( if (creditUsageError || !creditUsages) { throw new Error( - `Failed to retrieve credit usage for team_id: ${team_id}` + `Failed to retrieve credit usage for team_id: ${team_id}`, ); } const totalCreditsUsed = creditUsages.reduce( (acc, usage) => acc + usage.credits_used, - 0 + 0, ); const remainingCredits = FREE_CREDITS + couponCredits - totalCreditsUsed; return { totalCreditsUsed: totalCreditsUsed, remainingCredits, - totalCredits: FREE_CREDITS + couponCredits + totalCredits: FREE_CREDITS + couponCredits, }; } @@ -247,13 +247,13 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( if (creditUsageError || !creditUsages) { throw new Error( - `Failed to retrieve credit usage for subscription_id: ${subscription.id}` + `Failed to retrieve credit usage for subscription_id: ${subscription.id}`, ); } const totalCreditsUsed = creditUsages.reduce( (acc, usage) => acc + usage.credits_used, - 0 + 0, ); const { data: price, error: priceError } = await supabase_service @@ -264,7 +264,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( if (priceError || !price) { throw new Error( - `Failed to retrieve price for price_id: ${subscription.price_id}` + `Failed to retrieve price for price_id: ${subscription.price_id}`, ); } @@ -273,6 +273,6 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod( return { totalCreditsUsed, remainingCredits, - totalCredits: price.credits + totalCredits: price.credits, }; } diff --git a/apps/api/src/services/billing/issue_credits.ts b/apps/api/src/services/billing/issue_credits.ts index 3f013a1c..ce84db1b 100644 --- a/apps/api/src/services/billing/issue_credits.ts +++ b/apps/api/src/services/billing/issue_credits.ts @@ -8,7 +8,7 @@ export async function issueCredits(team_id: string, credits: number) { credits: credits, status: "active", // indicates that this coupon was issued from auto recharge - from_auto_recharge: true + from_auto_recharge: true, }); if (error) { diff --git a/apps/api/src/services/billing/stripe.ts b/apps/api/src/services/billing/stripe.ts index c5b76445..0d0b17cf 100644 --- a/apps/api/src/services/billing/stripe.ts +++ b/apps/api/src/services/billing/stripe.ts @@ -5,7 +5,7 @@ const stripe = new Stripe(process.env.STRIPE_SECRET_KEY ?? ""); async function getCustomerDefaultPaymentMethod(customerId: string) { const paymentMethods = await stripe.customers.listPaymentMethods(customerId, { - limit: 3 + limit: 3, }); return paymentMethods.data[0] ?? null; } @@ -13,14 +13,14 @@ async function getCustomerDefaultPaymentMethod(customerId: string) { type ReturnStatus = "succeeded" | "requires_action" | "failed"; export async function createPaymentIntent( team_id: string, - customer_id: string + customer_id: string, ): Promise<{ return_status: ReturnStatus; charge_id: string }> { try { const defaultPaymentMethod = await getCustomerDefaultPaymentMethod(customer_id); if (!defaultPaymentMethod) { logger.error( - `No default payment method found for customer: ${customer_id}` + `No default payment method found for customer: ${customer_id}`, ); return { return_status: "failed", charge_id: "" }; } @@ -32,7 +32,7 @@ export async function createPaymentIntent( payment_method_types: [defaultPaymentMethod?.type ?? "card"], payment_method: defaultPaymentMethod?.id, off_session: true, - confirm: true + confirm: true, }); if (paymentIntent.status === "succeeded") { @@ -51,7 +51,7 @@ export async function createPaymentIntent( } } catch (error) { logger.error( - `Failed to create or confirm PaymentIntent for team: ${team_id}` + `Failed to create or confirm PaymentIntent for team: ${team_id}`, ); console.error(error); return { return_status: "failed", charge_id: "" }; diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index bfdc84ce..86f88529 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -12,8 +12,8 @@ export async function logCrawl(job_id: string, team_id: string) { .insert([ { job_id: job_id, - team_id: team_id - } + team_id: team_id, + }, ]); } catch (error) { logger.error(`Error logging crawl job to supabase:\n${error}`); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index c3111dd7..b0754622 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -24,8 +24,8 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { job.docs = [ { content: "REDACTED DUE TO AUTHORIZATION HEADER", - html: "REDACTED DUE TO AUTHORIZATION HEADER" - } + html: "REDACTED DUE TO AUTHORIZATION HEADER", + }, ]; } const jobColumn = { @@ -43,7 +43,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { origin: job.origin, num_tokens: job.num_tokens, retry: !!job.retry, - crawl_id: job.crawl_id + crawl_id: job.crawl_id, }; if (force) { @@ -57,10 +57,10 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { if (error) { logger.error( "Failed to log job due to Supabase error -- trying again", - { error, scrapeId: job.job_id } + { error, scrapeId: job.job_id }, ); await new Promise((resolve) => - setTimeout(() => resolve(), 75) + setTimeout(() => resolve(), 75), ); } else { done = true; @@ -69,7 +69,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { } catch (error) { logger.error( "Failed to log job due to thrown error -- trying again", - { error, scrapeId: job.job_id } + { error, scrapeId: job.job_id }, ); await new Promise((resolve) => setTimeout(() => resolve(), 75)); } @@ -86,7 +86,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { if (error) { logger.error(`Error logging job: ${error.message}`, { error, - scrapeId: job.job_id + scrapeId: job.job_id, }); } else { logger.debug("Job logged successfully!", { scrapeId: job.job_id }); @@ -97,7 +97,7 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { let phLog = { distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user ...(job.team_id !== "preview" && { - groups: { team: job.team_id } + groups: { team: job.team_id }, }), //* Identifying event on this team event: "job-logged", properties: { @@ -112,8 +112,8 @@ export async function logJob(job: FirecrawlJob, force: boolean = false) { page_options: job.scrapeOptions, origin: job.origin, num_tokens: job.num_tokens, - retry: job.retry - } + retry: job.retry, + }, }; if (job.mode !== "single_urls") { posthog.capture(phLog); diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 3ccaf777..6e076330 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -8,7 +8,7 @@ configDotenv(); export async function logScrape( scrapeLog: ScrapeLog, - pageOptions?: PageOptions + pageOptions?: PageOptions, ) { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; if (!useDbAuthentication) { @@ -42,8 +42,8 @@ export async function logScrape( date_added: new Date().toISOString(), html: "Removed to save db space", ipv4_support: scrapeLog.ipv4_support, - ipv6_support: scrapeLog.ipv6_support - } + ipv6_support: scrapeLog.ipv6_support, + }, ]); if (error) { diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index 22c23865..6f310e5e 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -14,25 +14,25 @@ const emailTemplates: Record< > = { [NotificationType.APPROACHING_LIMIT]: { subject: "You've used 80% of your credit limit - Firecrawl", - html: "Hey there,

You are approaching your credit limit for this billing period. Your usage right now is around 80% of your total credit limit. Consider upgrading your plan to avoid hitting the limit. Check out our pricing page for more info.


Thanks,
Firecrawl Team
" + html: "Hey there,

You are approaching your credit limit for this billing period. Your usage right now is around 80% of your total credit limit. Consider upgrading your plan to avoid hitting the limit. Check out our pricing page for more info.


Thanks,
Firecrawl Team
", }, [NotificationType.LIMIT_REACHED]: { subject: "Credit Limit Reached! Take action now to resume usage - Firecrawl", - html: "Hey there,

You have reached your credit limit for this billing period. To resume usage, please upgrade your plan. Check out our pricing page for more info.


Thanks,
Firecrawl Team
" + html: "Hey there,

You have reached your credit limit for this billing period. To resume usage, please upgrade your plan. Check out our pricing page for more info.


Thanks,
Firecrawl Team
", }, [NotificationType.RATE_LIMIT_REACHED]: { subject: "Rate Limit Reached - Firecrawl", - html: "Hey there,

You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.

If you have any questions, feel free to reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team

Ps. this email is only sent once every 7 days if you reach a rate limit." + html: "Hey there,

You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our pricing page for more info.

If you have any questions, feel free to reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team

Ps. this email is only sent once every 7 days if you reach a rate limit.", }, [NotificationType.AUTO_RECHARGE_SUCCESS]: { subject: "Auto recharge successful - Firecrawl", - html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at firecrawl.dev/pricing to avoid hitting the limit.


Thanks,
Firecrawl Team
" + html: "Hey there,

Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at firecrawl.dev/pricing to avoid hitting the limit.


Thanks,
Firecrawl Team
", }, [NotificationType.AUTO_RECHARGE_FAILED]: { subject: "Auto recharge failed - Firecrawl", - html: "Hey there,

Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team
" - } + html: "Hey there,

Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at help@firecrawl.com


Thanks,
Firecrawl Team
", + }, }; export async function sendNotification( @@ -41,7 +41,7 @@ export async function sendNotification( startDateString: string | null, endDateString: string | null, chunk: AuthCreditUsageChunk, - bypassRecentChecks: boolean = false + bypassRecentChecks: boolean = false, ) { return withAuth(sendNotificationInternal, undefined)( team_id, @@ -49,13 +49,13 @@ export async function sendNotification( startDateString, endDateString, chunk, - bypassRecentChecks + bypassRecentChecks, ); } export async function sendEmailNotification( email: string, - notificationType: NotificationType + notificationType: NotificationType, ) { const resend = new Resend(process.env.RESEND_API_KEY); @@ -65,7 +65,7 @@ export async function sendEmailNotification( to: [email], reply_to: "help@firecrawl.com", subject: emailTemplates[notificationType].subject, - html: emailTemplates[notificationType].html + html: emailTemplates[notificationType].html, }); if (error) { @@ -84,7 +84,7 @@ export async function sendNotificationInternal( startDateString: string | null, endDateString: string | null, chunk: AuthCreditUsageChunk, - bypassRecentChecks: boolean = false + bypassRecentChecks: boolean = false, ): Promise<{ success: boolean }> { if (team_id === "preview") { return { success: true }; @@ -125,7 +125,7 @@ export async function sendNotificationInternal( if (recentError) { logger.debug( - `Error fetching recent notifications: ${recentError.message}` + `Error fetching recent notifications: ${recentError.message}`, ); return { success: false }; } @@ -136,7 +136,7 @@ export async function sendNotificationInternal( } console.log( - `Sending notification for team_id: ${team_id} and notificationType: ${notificationType}` + `Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`, ); // get the emails from the user with the team_id const { data: emails, error: emailsError } = await supabase_service @@ -160,15 +160,15 @@ export async function sendNotificationInternal( team_id: team_id, notification_type: notificationType, sent_date: new Date().toISOString(), - timestamp: new Date().toISOString() - } + timestamp: new Date().toISOString(), + }, ]); if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) { sendSlackWebhook( `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`, false, - process.env.SLACK_ADMIN_WEBHOOK_URL + process.env.SLACK_ADMIN_WEBHOOK_URL, ).catch((error) => { logger.debug(`Error sending slack notification: ${error}`); }); @@ -180,6 +180,6 @@ export async function sendNotificationInternal( } return { success: true }; - } + }, ); } diff --git a/apps/api/src/services/notification/notification_string.ts b/apps/api/src/services/notification/notification_string.ts index 72bc60c4..46da76e0 100644 --- a/apps/api/src/services/notification/notification_string.ts +++ b/apps/api/src/services/notification/notification_string.ts @@ -2,7 +2,7 @@ import { NotificationType } from "../../types"; // depending on the notification type, return the appropriate string export function getNotificationString( - notificationType: NotificationType + notificationType: NotificationType, ): string { switch (notificationType) { case NotificationType.APPROACHING_LIMIT: diff --git a/apps/api/src/services/posthog.ts b/apps/api/src/services/posthog.ts index 69f370ec..3f56123c 100644 --- a/apps/api/src/services/posthog.ts +++ b/apps/api/src/services/posthog.ts @@ -6,7 +6,7 @@ export default function PostHogClient(apiKey: string) { const posthogClient = new PostHog(apiKey, { host: process.env.POSTHOG_HOST, flushAt: 1, - flushInterval: 0 + flushInterval: 0, }); return posthogClient; } @@ -21,7 +21,7 @@ export const posthog = process.env.POSTHOG_API_KEY ? PostHogClient(process.env.POSTHOG_API_KEY) : (() => { logger.warn( - "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more." + "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more.", ); return new MockPostHog(); })(); diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index b4bd799b..bd2b9121 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -8,14 +8,14 @@ import { getConcurrencyLimitActiveJobs, getConcurrencyLimitMax, pushConcurrencyLimitActiveJob, - pushConcurrencyLimitedJob + pushConcurrencyLimitedJob, } from "../lib/concurrency-limit"; async function addScrapeJobRaw( webScraperOptions: any, options: any, jobId: string, - jobPriority: number = 10 + jobPriority: number = 10, ) { let concurrencyLimited = false; @@ -39,9 +39,9 @@ async function addScrapeJobRaw( opts: { ...options, priority: jobPriority, - jobId: jobId + jobId: jobId, }, - priority: jobPriority + priority: jobPriority, }); } else { if ( @@ -55,7 +55,7 @@ async function addScrapeJobRaw( await getScrapeQueue().add(jobId, webScraperOptions, { ...options, priority: jobPriority, - jobId + jobId, }); } } @@ -64,7 +64,7 @@ export async function addScrapeJob( webScraperOptions: WebScraperOptions, options: any = {}, jobId: string = uuidv4(), - jobPriority: number = 10 + jobPriority: number = 10, ) { if (Sentry.isInitialized()) { const size = JSON.stringify(webScraperOptions).length; @@ -75,8 +75,8 @@ export async function addScrapeJob( attributes: { "messaging.message.id": jobId, "messaging.destination.name": getScrapeQueue().name, - "messaging.message.body.size": size - } + "messaging.message.body.size": size, + }, }, async (span) => { await addScrapeJobRaw( @@ -85,14 +85,14 @@ export async function addScrapeJob( sentry: { trace: Sentry.spanToTraceHeader(span), baggage: Sentry.spanToBaggageHeader(span), - size - } + size, + }, }, options, jobId, - jobPriority + jobPriority, ); - } + }, ); } else { await addScrapeJobRaw(webScraperOptions, options, jobId, jobPriority); @@ -106,19 +106,19 @@ export async function addScrapeJobs( jobId: string; priority: number; }; - }[] + }[], ) { // TODO: better await Promise.all( jobs.map((job) => - addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority) - ) + addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority), + ), ); } export function waitForJob( jobId: string, - timeout: number + timeout: number, ): Promise { return new Promise((resolve, reject) => { const start = Date.now(); diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 3970a6e7..3cfd8c91 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -5,7 +5,7 @@ import IORedis from "ioredis"; let scrapeQueue: Queue; export const redisConnection = new IORedis(process.env.REDIS_URL!, { - maxRetriesPerRequest: null + maxRetriesPerRequest: null, }); export const scrapeQueueName = "{scrapeQueue}"; @@ -18,13 +18,13 @@ export function getScrapeQueue() { connection: redisConnection, defaultJobOptions: { removeOnComplete: { - age: 90000 // 25 hours + age: 90000, // 25 hours }, removeOnFail: { - age: 90000 // 25 hours - } - } - } + age: 90000, // 25 hours + }, + }, + }, // { // settings: { // lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index dc352d36..29f4b84f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -5,7 +5,7 @@ import { CustomError } from "../lib/custom-error"; import { getScrapeQueue, redisConnection, - scrapeQueueName + scrapeQueueName, } from "./queue-service"; import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; @@ -24,14 +24,14 @@ import { getCrawl, getCrawlJobs, lockURL, - normalizeURL + normalizeURL, } from "../lib/crawl-redis"; import { StoredCrawl } from "../lib/crawl-redis"; import { addScrapeJob } from "./queue-jobs"; import { addJobPriority, deleteJobPriority, - getJobPriority + getJobPriority, } from "../../src/lib/job-priority"; import { PlanType, RateLimiterMode } from "../types"; import { getJobs } from "..//controllers/v1/crawl-status"; @@ -42,7 +42,7 @@ import { cleanOldConcurrencyLimitEntries, pushConcurrencyLimitActiveJob, removeConcurrencyLimitActiveJob, - takeConcurrencyLimitedJob + takeConcurrencyLimitedJob, } from "../lib/concurrency-limit"; configDotenv(); @@ -74,7 +74,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { const jobIDs = await getCrawlJobs(job.data.crawl_id); const jobs = (await getJobs(jobIDs)).sort( - (a, b) => a.timestamp - b.timestamp + (a, b) => a.timestamp - b.timestamp, ); // const jobStatuses = await Promise.all(jobs.map((x) => x.getState())); const jobStatus = sc.cancelled // || jobStatuses.some((x) => x === "failed") @@ -87,7 +87,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { ? Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue - : null + : null, ) .filter((x) => x !== null); @@ -103,7 +103,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { url: sc.originUrl!, scrapeOptions: sc.scrapeOptions, crawlerOptions: sc.crawlerOptions, - origin: job.data.origin + origin: job.data.origin, }); const data = { @@ -112,12 +112,12 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { links: fullDocs.map((doc) => { return { content: doc, - source: doc?.metadata?.sourceURL ?? doc?.url ?? "" + source: doc?.metadata?.sourceURL ?? doc?.url ?? "", }; - }) + }), }, project_id: job.data.project_id, - docs: fullDocs + docs: fullDocs, }; // v0 web hooks, call when done with all the data @@ -130,7 +130,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { job.data.v1, job.data.crawlerOptions !== null ? "crawl.completed" - : "batch_scrape.completed" + : "batch_scrape.completed", ); } } else { @@ -147,7 +147,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { job.data.v1, job.data.crawlerOptions !== null ? "crawl.completed" - : "batch_scrape.completed" + : "batch_scrape.completed", ); } @@ -166,9 +166,9 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"), crawlerOptions: sc.crawlerOptions, - origin: job.data.origin + origin: job.data.origin, }, - true + true, ); } } @@ -180,7 +180,7 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { method: "processJobInternal", jobId: job.id, scrapeId: job.id, - crawlId: job.data?.crawl_id ?? undefined + crawlId: job.data?.crawl_id ?? undefined, }); const extendLockInterval = setInterval(async () => { @@ -196,7 +196,7 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => { try { if (job.data.crawl_id && process.env.USE_DB_AUTHENTICATION === "true") { logger.debug( - "Job succeeded -- has crawl associated, putting null in Redis" + "Job succeeded -- has crawl associated, putting null in Redis", ); await job.moveToCompleted(null, token, false); } else { @@ -237,7 +237,7 @@ let cantAcceptConnectionCount = 0; const workerFun = async ( queue: Queue, - processJobInternal: (token: string, job: Job) => Promise + processJobInternal: (token: string, job: Job) => Promise, ) => { const logger = _logger.child({ module: "queue-worker", method: "workerFun" }); @@ -246,7 +246,7 @@ const workerFun = async ( lockDuration: 1 * 60 * 1000, // 1 minute // lockRenewTime: 15 * 1000, // 15 seconds stalledInterval: 30 * 1000, // 30 seconds - maxStalledCount: 10 // 10 times + maxStalledCount: 10, // 10 times }); worker.startStalledCheckTimer(); @@ -267,7 +267,7 @@ const workerFun = async ( if (cantAcceptConnectionCount >= 25) { logger.error("WORKER STALLED", { cpuUsage: await monitor.checkCpuUsage(), - memoryUsage: await monitor.checkMemoryUsage() + memoryUsage: await monitor.checkMemoryUsage(), }); } @@ -295,13 +295,13 @@ const workerFun = async ( nextJob.id, { ...nextJob.data, - concurrencyLimitHit: true + concurrencyLimitHit: true, }, { ...nextJob.opts, jobId: nextJob.id, - priority: nextJob.priority - } + priority: nextJob.priority, + }, ); } } @@ -311,7 +311,7 @@ const workerFun = async ( Sentry.continueTrace( { sentryTrace: job.data.sentry.trace, - baggage: job.data.sentry.baggage + baggage: job.data.sentry.baggage, }, () => { Sentry.startSpan( @@ -319,8 +319,8 @@ const workerFun = async ( name: "Scrape job", attributes: { job: job.id, - worker: process.env.FLY_MACHINE_ID ?? worker.id - } + worker: process.env.FLY_MACHINE_ID ?? worker.id, + }, }, async (span) => { await Sentry.startSpan( @@ -333,8 +333,8 @@ const workerFun = async ( "messaging.message.body.size": job.data.sentry.size, "messaging.message.receive.latency": Date.now() - (job.processedOn ?? job.timestamp), - "messaging.message.retry.count": job.attemptsMade - } + "messaging.message.retry.count": job.attemptsMade, + }, }, async () => { let res; @@ -349,11 +349,11 @@ const workerFun = async ( } else { span.setStatus({ code: 1 }); // OK } - } + }, ); - } + }, ); - } + }, ); } else { Sentry.startSpan( @@ -361,12 +361,12 @@ const workerFun = async ( name: "Scrape job", attributes: { job: job.id, - worker: process.env.FLY_MACHINE_ID ?? worker.id - } + worker: process.env.FLY_MACHINE_ID ?? worker.id, + }, }, () => { processJobInternal(token, job).finally(() => afterJobDone(job)); - } + }, ); } @@ -385,7 +385,7 @@ async function processJob(job: Job & { id: string }, token: string) { method: "processJob", jobId: job.id, scrapeId: job.id, - crawlId: job.data?.crawl_id ?? undefined + crawlId: job.data?.crawl_id ?? undefined, }); logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url }); @@ -403,7 +403,7 @@ async function processJob(job: Job & { id: string }, token: string) { document: null, project_id: job.data.project_id, error: - "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error." + "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.", }; return data; } @@ -413,23 +413,23 @@ async function processJob(job: Job & { id: string }, token: string) { current: 1, total: 100, current_step: "SCRAPING", - current_url: "" + current_url: "", }); const start = Date.now(); const pipeline = await Promise.race([ startWebScraperPipeline({ job, - token + token, }), ...(job.data.scrapeOptions.timeout !== undefined ? [ (async () => { await sleep(job.data.scrapeOptions.timeout); throw new Error("timeout"); - })() + })(), ] - : []) + : []), ]); if (!pipeline.success) { @@ -450,17 +450,17 @@ async function processJob(job: Job & { id: string }, token: string) { links: [ { content: doc, - source: doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? "" - } - ] + source: doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? "", + }, + ], }, project_id: job.data.project_id, - document: doc + document: doc, }; if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) { logger.debug("Calling webhook with success...", { - webhook: job.data.webhook + webhook: job.data.webhook, }); await callWebhook( job.data.team_id, @@ -469,7 +469,7 @@ async function processJob(job: Job & { id: string }, token: string) { job.data.webhook, job.data.v1, job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page", - true + true, ); } @@ -484,18 +484,18 @@ async function processJob(job: Job & { id: string }, token: string) { ) { logger.debug( "Was redirected, removing old URL and locking new URL...", - { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url } + { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url }, ); // Remove the old URL from visited unique due to checking for limit // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL) await redisConnection.srem( "crawl:" + job.data.crawl_id + ":visited_unique", - normalizeURL(doc.metadata.sourceURL, sc) + normalizeURL(doc.metadata.sourceURL, sc), ); const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc)); const p2 = generateURLPermutations( - normalizeURL(doc.metadata.sourceURL, sc) + normalizeURL(doc.metadata.sourceURL, sc), ); // In crawls, we should only crawl a redirected page once, no matter how many; times it is redirected to, or if it's been discovered by the crawler before. @@ -525,9 +525,9 @@ async function processJob(job: Job & { id: string }, token: string) { crawlerOptions: sc.crawlerOptions, scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, - crawl_id: job.data.crawl_id + crawl_id: job.data.crawl_id, }, - true + true, ); logger.debug("Declaring job as done..."); @@ -538,19 +538,19 @@ async function processJob(job: Job & { id: string }, token: string) { const crawler = crawlToCrawler( job.data.crawl_id, sc, - doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl! + doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!, ); const links = crawler.filterLinks( crawler.extractLinksFromHTML( rawHtml ?? "", - doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl! + doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!, ), Infinity, - sc.crawlerOptions?.maxDepth ?? 10 + sc.crawlerOptions?.maxDepth ?? 10, ); logger.debug("Discovered " + links.length + " links...", { - linksLength: links.length + linksLength: links.length, }); for (const link of links) { @@ -559,7 +559,7 @@ async function processJob(job: Job & { id: string }, token: string) { const jobPriority = await getJobPriority({ plan: sc.plan as PlanType, team_id: sc.team_id, - basePriority: job.data.crawl_id ? 20 : 10 + basePriority: job.data.crawl_id ? 20 : 10, }); const jobId = uuidv4(); @@ -568,7 +568,7 @@ async function processJob(job: Job & { id: string }, token: string) { jobPriority + " for URL " + JSON.stringify(link), - { jobPriority, url: link } + { jobPriority, url: link }, ); // console.log("plan: ", sc.plan); @@ -587,22 +587,22 @@ async function processJob(job: Job & { id: string }, token: string) { origin: job.data.origin, crawl_id: job.data.crawl_id, webhook: job.data.webhook, - v1: job.data.v1 + v1: job.data.v1, }, {}, jobId, - jobPriority + jobPriority, ); await addCrawlJob(job.data.crawl_id, jobId); logger.debug("Added job for URL " + JSON.stringify(link), { jobPriority, url: link, - newJobId: jobId + newJobId: jobId, }); } else { logger.debug("Could not lock URL " + JSON.stringify(link), { - url: link + url: link, }); } } @@ -627,8 +627,8 @@ async function processJob(job: Job & { id: string }, token: string) { Sentry.captureException(error, { data: { - job: job.id - } + job: job.id, + }, }); if (error instanceof CustomError) { @@ -650,7 +650,7 @@ async function processJob(job: Job & { id: string }, token: string) { ? error : typeof error === "string" ? new Error(error) - : new Error(JSON.stringify(error)) + : new Error(JSON.stringify(error)), }; if (!job.data.v1 && (job.data.mode === "crawl" || job.data.crawl_id)) { @@ -660,7 +660,7 @@ async function processJob(job: Job & { id: string }, token: string) { data, job.data.webhook, job.data.v1, - job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page" + job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page", ); } // if (job.data.v1) { @@ -699,9 +699,9 @@ async function processJob(job: Job & { id: string }, token: string) { crawlerOptions: sc.crawlerOptions, scrapeOptions: job.data.scrapeOptions, origin: job.data.origin, - crawl_id: job.data.crawl_id + crawl_id: job.data.crawl_id, }, - true + true, ); await finishCrawlIfNeeded(job, sc); diff --git a/apps/api/src/services/rate-limiter.test.ts b/apps/api/src/services/rate-limiter.test.ts index 5c25a8d7..098a657c 100644 --- a/apps/api/src/services/rate-limiter.test.ts +++ b/apps/api/src/services/rate-limiter.test.ts @@ -2,7 +2,7 @@ import { getRateLimiter, serverRateLimiter, testSuiteRateLimiter, - redisRateLimitClient + redisRateLimitClient, } from "./rate-limiter"; import { RateLimiterMode } from "../../src/types"; import { RateLimiterRedis } from "rate-limiter-flexible"; @@ -33,13 +33,13 @@ describe("Rate Limiter Service", () => { it("should return the testSuiteRateLimiter for specific tokens", () => { const limiter = getRateLimiter( "crawl" as RateLimiterMode, - "test-prefix:a01ccae" + "test-prefix:a01ccae", ); expect(limiter).toBe(testSuiteRateLimiter); const limiter2 = getRateLimiter( "scrape" as RateLimiterMode, - "test-prefix:6254cf9" + "test-prefix:6254cf9", ); expect(limiter2).toBe(testSuiteRateLimiter); }); @@ -47,7 +47,7 @@ describe("Rate Limiter Service", () => { it("should return the serverRateLimiter if mode is not found", () => { const limiter = getRateLimiter( "nonexistent" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter.points).toBe(serverRateLimiter.points); }); @@ -56,28 +56,28 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "crawl" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(2); const limiter2 = getRateLimiter( "scrape" as RateLimiterMode, "test-prefix:someToken", - "standard" + "standard", ); expect(limiter2.points).toBe(100); const limiter3 = getRateLimiter( "search" as RateLimiterMode, "test-prefix:someToken", - "growth" + "growth", ); expect(limiter3.points).toBe(500); const limiter4 = getRateLimiter( "crawlStatus" as RateLimiterMode, "test-prefix:someToken", - "growth" + "growth", ); expect(limiter4.points).toBe(250); }); @@ -85,13 +85,13 @@ describe("Rate Limiter Service", () => { it("should return the default rate limiter if plan is not provided", () => { const limiter = getRateLimiter( "crawl" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter.points).toBe(3); const limiter2 = getRateLimiter( "scrape" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(20); }); @@ -103,7 +103,7 @@ describe("Rate Limiter Service", () => { storeClient: redisRateLimitClient, keyPrefix, points, - duration: 60 + duration: 60, }); expect(limiter.keyPrefix).toBe(keyPrefix); @@ -115,13 +115,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "preview" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(5); const limiter2 = getRateLimiter( "preview" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(5); }); @@ -130,13 +130,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "account" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(100); const limiter2 = getRateLimiter( "account" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(100); }); @@ -145,13 +145,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "crawlStatus" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(150); const limiter2 = getRateLimiter( "crawlStatus" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(250); }); @@ -160,13 +160,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "crawl" as RateLimiterMode, "test-prefix:someTokenCRAWL", - "free" + "free", ); const consumePoints = 1; const res = await limiter.consume( "test-prefix:someTokenCRAWL", - consumePoints + consumePoints, ); expect(res.remainingPoints).toBe(1); }); @@ -174,7 +174,7 @@ describe("Rate Limiter Service", () => { it("should consume points correctly for 'scrape' mode (DEFAULT)", async () => { const limiter = getRateLimiter( "scrape" as RateLimiterMode, - "test-prefix:someTokenX" + "test-prefix:someTokenX", ); const consumePoints = 4; @@ -186,7 +186,7 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "scrape" as RateLimiterMode, "test-prefix:someTokenXY", - "hobby" + "hobby", ); expect(limiter.points).toBe(20); @@ -201,21 +201,21 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "crawl" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(2); const limiter2 = getRateLimiter( "crawl" as RateLimiterMode, "test-prefix:someToken", - "starter" + "starter", ); expect(limiter2.points).toBe(10); const limiter3 = getRateLimiter( "crawl" as RateLimiterMode, "test-prefix:someToken", - "standard" + "standard", ); expect(limiter3.points).toBe(5); }); @@ -224,28 +224,28 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "scrape" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(10); const limiter2 = getRateLimiter( "scrape" as RateLimiterMode, "test-prefix:someToken", - "starter" + "starter", ); expect(limiter2.points).toBe(100); const limiter3 = getRateLimiter( "scrape" as RateLimiterMode, "test-prefix:someToken", - "standard" + "standard", ); expect(limiter3.points).toBe(100); const limiter4 = getRateLimiter( "scrape" as RateLimiterMode, "test-prefix:someToken", - "growth" + "growth", ); expect(limiter4.points).toBe(1000); }); @@ -254,21 +254,21 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "search" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(5); const limiter2 = getRateLimiter( "search" as RateLimiterMode, "test-prefix:someToken", - "starter" + "starter", ); expect(limiter2.points).toBe(50); const limiter3 = getRateLimiter( "search" as RateLimiterMode, "test-prefix:someToken", - "standard" + "standard", ); expect(limiter3.points).toBe(50); }); @@ -277,13 +277,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "preview" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(5); const limiter2 = getRateLimiter( "preview" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(5); }); @@ -292,13 +292,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "account" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(100); const limiter2 = getRateLimiter( "account" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(100); }); @@ -307,13 +307,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "crawlStatus" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(150); const limiter2 = getRateLimiter( "crawlStatus" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(250); }); @@ -322,13 +322,13 @@ describe("Rate Limiter Service", () => { const limiter = getRateLimiter( "testSuite" as RateLimiterMode, "test-prefix:someToken", - "free" + "free", ); expect(limiter.points).toBe(10000); const limiter2 = getRateLimiter( "testSuite" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); expect(limiter2.points).toBe(10000); }); @@ -336,7 +336,7 @@ describe("Rate Limiter Service", () => { it("should throw an error when consuming more points than available", async () => { const limiter = getRateLimiter( "crawl" as RateLimiterMode, - "test-prefix:someToken" + "test-prefix:someToken", ); const consumePoints = limiter.points + 1; @@ -357,7 +357,7 @@ describe("Rate Limiter Service", () => { storeClient: redisRateLimitClient, keyPrefix, points, - duration + duration, }); const consumePoints = 5; diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 8067f862..5b8e39ca 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -18,7 +18,7 @@ const RATE_LIMITS = { etier2c: 300, etier1a: 1000, etier2a: 300, - etierscale1: 150 + etierscale1: 150, }, scrape: { default: 20, @@ -35,7 +35,7 @@ const RATE_LIMITS = { etier2c: 2500, etier1a: 1000, etier2a: 2500, - etierscale1: 1500 + etierscale1: 1500, }, search: { default: 20, @@ -52,7 +52,7 @@ const RATE_LIMITS = { etier2c: 2500, etier1a: 1000, etier2a: 2500, - etierscale1: 1500 + etierscale1: 1500, }, map: { default: 20, @@ -69,28 +69,28 @@ const RATE_LIMITS = { etier2c: 2500, etier1a: 1000, etier2a: 2500, - etierscale1: 1500 + etierscale1: 1500, }, preview: { free: 5, - default: 5 + default: 5, }, account: { free: 100, - default: 100 + default: 100, }, crawlStatus: { free: 300, - default: 500 + default: 500, }, testSuite: { free: 10000, - default: 10000 - } + default: 10000, + }, }; export const redisRateLimitClient = new Redis( - process.env.REDIS_RATE_LIMIT_URL! + process.env.REDIS_RATE_LIMIT_URL!, ); const createRateLimiter = (keyPrefix, points) => @@ -98,54 +98,54 @@ const createRateLimiter = (keyPrefix, points) => storeClient: redisRateLimitClient, keyPrefix, points, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); export const serverRateLimiter = createRateLimiter( "server", - RATE_LIMITS.account.default + RATE_LIMITS.account.default, ); export const testSuiteRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "test-suite", points: 10000, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); export const devBRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "dev-b", points: 1200, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); export const manualRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "manual", points: 2000, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); export const scrapeStatusRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "scrape-status", points: 400, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); export const etier1aRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "etier1a", points: 10000, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); export const etier2aRateLimiter = new RateLimiterRedis({ storeClient: redisRateLimitClient, keyPrefix: "etier2a", points: 2500, - duration: 60 // Duration in seconds + duration: 60, // Duration in seconds }); const testSuiteTokens = [ @@ -165,7 +165,7 @@ const testSuiteTokens = [ "fd769b2", "4c2638d", "cbb3462", // don't remove (s-ai) - "824abcd" // don't remove (s-ai) + "824abcd", // don't remove (s-ai) ]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; @@ -178,7 +178,7 @@ export function getRateLimiterPoints( mode: RateLimiterMode, token?: string, plan?: string, - teamId?: string + teamId?: string, ): number { const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} @@ -193,7 +193,7 @@ export function getRateLimiter( mode: RateLimiterMode, token?: string, plan?: string, - teamId?: string + teamId?: string, ): RateLimiterRedis { if (token && testSuiteTokens.some((testToken) => token.includes(testToken))) { return testSuiteRateLimiter; @@ -221,6 +221,6 @@ export function getRateLimiter( return createRateLimiter( `${mode}-${makePlanKey(plan)}`, - getRateLimiterPoints(mode, token, plan, teamId) + getRateLimiterPoints(mode, token, plan, teamId), ); } diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index 04fcbd5e..d2c7dd3a 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -39,7 +39,7 @@ const setValue = async ( key: string, value: string, expire?: number, - nx = false + nx = false, ) => { if (expire && !nx) { await redisRateLimitClient.set(key, value, "EX", expire); diff --git a/apps/api/src/services/redlock.ts b/apps/api/src/services/redlock.ts index 757346f9..923cfc3d 100644 --- a/apps/api/src/services/redlock.ts +++ b/apps/api/src/services/redlock.ts @@ -21,6 +21,6 @@ export const redlock = new Redlock( // The minimum remaining time on a lock before an extension is automatically // attempted with the `using` API. - automaticExtensionThreshold: 500 // time in ms - } + automaticExtensionThreshold: 500, // time in ms + }, ); diff --git a/apps/api/src/services/sentry.ts b/apps/api/src/services/sentry.ts index 41f19362..927b33c3 100644 --- a/apps/api/src/services/sentry.ts +++ b/apps/api/src/services/sentry.ts @@ -11,6 +11,6 @@ if (process.env.SENTRY_DSN) { tracesSampleRate: process.env.SENTRY_ENVIRONMENT === "dev" ? 1.0 : 0.045, profilesSampleRate: 1.0, serverName: process.env.FLY_MACHINE_ID, - environment: process.env.SENTRY_ENVIRONMENT ?? "production" + environment: process.env.SENTRY_ENVIRONMENT ?? "production", }); } diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 521a82ca..4ab63815 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -15,12 +15,12 @@ class SupabaseService { if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null logger.warn( - "Authentication is disabled. Supabase client will not be initialized." + "Authentication is disabled. Supabase client will not be initialized.", ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { logger.error( - "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable", ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -52,6 +52,6 @@ export const supabase_service: SupabaseClient = new Proxy( } // Otherwise, delegate access to the Supabase client. return Reflect.get(client, prop, receiver); - } - } + }, + }, ) as unknown as SupabaseClient; diff --git a/apps/api/src/services/system-monitor.ts b/apps/api/src/services/system-monitor.ts index 4fa4c478..886de6ff 100644 --- a/apps/api/src/services/system-monitor.ts +++ b/apps/api/src/services/system-monitor.ts @@ -137,7 +137,7 @@ class SystemMonitor { } } catch (error) { logger.warn( - `Unable to read cpuset.cpus.effective, defaulting to OS CPUs: ${error}` + `Unable to read cpuset.cpus.effective, defaulting to OS CPUs: ${error}`, ); cpus = os.cpus().map((cpu, index) => index); } diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index dfee11f6..6b580a36 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -14,12 +14,12 @@ export const callWebhook = async ( specified?: z.infer, v1 = false, eventType: WebhookEventType = "crawl.page", - awaitWebhook: boolean = false + awaitWebhook: boolean = false, ) => { try { const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace( "{{JOB_ID}}", - id + id, ); const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; let webhookUrl = @@ -36,7 +36,7 @@ export const callWebhook = async ( .limit(1); if (error) { logger.error( - `Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}` + `Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`, ); return null; } @@ -54,7 +54,7 @@ export const callWebhook = async ( specified, v1, eventType, - awaitWebhook + awaitWebhook, }); if (!webhookUrl) { @@ -75,7 +75,7 @@ export const callWebhook = async ( dataToSend.push({ content: data.result.links[i].content.content, markdown: data.result.links[i].content.markdown, - metadata: data.result.links[i].content.metadata + metadata: data.result.links[i].content.metadata, }); } } @@ -98,19 +98,19 @@ export const callWebhook = async ( ? data?.error || undefined : eventType === "crawl.page" ? data?.error || undefined - : undefined + : undefined, }, { headers: { "Content-Type": "application/json", - ...webhookUrl.headers + ...webhookUrl.headers, }, - timeout: v1 ? 10000 : 30000 // 10 seconds timeout (v1) - } + timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) + }, ); } catch (error) { logger.error( - `Axios error (0) sending webhook for team ID: ${teamId}, error: ${error.message}` + `Axios error (0) sending webhook for team ID: ${teamId}, error: ${error.message}`, ); } } else { @@ -130,24 +130,24 @@ export const callWebhook = async ( ? data?.error || undefined : eventType === "crawl.page" ? data?.error || undefined - : undefined + : undefined, }, { headers: { "Content-Type": "application/json", - ...webhookUrl.headers - } - } + ...webhookUrl.headers, + }, + }, ) .catch((error) => { logger.error( - `Axios error sending webhook for team ID: ${teamId}, error: ${error.message}` + `Axios error sending webhook for team ID: ${teamId}, error: ${error.message}`, ); }); } } catch (error) { logger.debug( - `Error sending webhook for team ID: ${teamId}, error: ${error.message}` + `Error sending webhook for team ID: ${teamId}, error: ${error.message}`, ); } }; diff --git a/apps/api/src/supabase_types.ts b/apps/api/src/supabase_types.ts index 8f9e1b64..00b2efbb 100644 --- a/apps/api/src/supabase_types.ts +++ b/apps/api/src/supabase_types.ts @@ -40,7 +40,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; company: { @@ -77,7 +77,7 @@ export interface Database { columns: ["pricing_plan_id"]; referencedRelation: "pricing_plan"; referencedColumns: ["id"]; - } + }, ]; }; constants: { @@ -126,7 +126,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; customers: { @@ -157,7 +157,7 @@ export interface Database { columns: ["user_id"]; referencedRelation: "users"; referencedColumns: ["id"]; - } + }, ]; }; data: { @@ -236,7 +236,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; data_partitioned: { @@ -390,7 +390,7 @@ export interface Database { columns: ["company_id"]; referencedRelation: "company"; referencedColumns: ["company_id"]; - } + }, ]; }; message: { @@ -439,7 +439,7 @@ export interface Database { columns: ["conversation_id"]; referencedRelation: "conversation"; referencedColumns: ["conversation_id"]; - } + }, ]; }; model_configuration: { @@ -479,7 +479,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; monthly_message_counts: { @@ -507,7 +507,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; prices: { @@ -560,7 +560,7 @@ export interface Database { columns: ["product_id"]; referencedRelation: "products"; referencedColumns: ["id"]; - } + }, ]; }; pricing_plan: { @@ -747,7 +747,7 @@ export interface Database { columns: ["user_id"]; referencedRelation: "users"; referencedColumns: ["id"]; - } + }, ]; }; suggested_questions: { @@ -775,7 +775,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; user_notifications: { @@ -821,7 +821,7 @@ export interface Database { columns: ["user_id"]; referencedRelation: "users"; referencedColumns: ["id"]; - } + }, ]; }; users: { @@ -864,7 +864,7 @@ export interface Database { columns: ["id"]; referencedRelation: "users"; referencedColumns: ["id"]; - } + }, ]; }; z_testcomp_92511: { @@ -934,7 +934,7 @@ export interface Database { columns: ["project_id"]; referencedRelation: "mendable_project"; referencedColumns: ["id"]; - } + }, ]; }; }; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index cfae8f23..5325a0ad 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -3,7 +3,7 @@ import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, - webhookSchema + webhookSchema, } from "./controllers/v1/types"; import { ExtractorOptions, Document } from "./lib/entities"; import { InternalOptions } from "./scraper/scrapeURL"; @@ -127,7 +127,7 @@ export enum RateLimiterMode { Scrape = "scrape", Preview = "preview", Search = "search", - Map = "map" + Map = "map", } export type AuthResponse = @@ -149,7 +149,7 @@ export enum NotificationType { LIMIT_REACHED = "limitReached", RATE_LIMIT_REACHED = "rateLimitReached", AUTO_RECHARGE_SUCCESS = "autoRechargeSuccess", - AUTO_RECHARGE_FAILED = "autoRechargeFailed" + AUTO_RECHARGE_FAILED = "autoRechargeFailed", } export type ScrapeLog = {

You are approaching your credit limit for this billing period. Your usage right now is around 80% of your total credit limit. Consider upgrading your plan to avoid hitting the limit. Check out our pricing page for more info.