From 22a5e85899eb893c9a68f53201e13f5fb569bc46 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 12:26:55 -0300 Subject: [PATCH 1/6] Update index.ts --- apps/api/src/index.ts | 102 ++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 58370158..1edf3759 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry" +import "./services/sentry"; import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; @@ -12,9 +12,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; -import http from 'node:http'; -import https from 'node:https'; -import CacheableLookup from 'cacheable-lookup'; +import http from "node:http"; +import https from "node:https"; +import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; @@ -31,11 +31,11 @@ Logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup({ // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup:false + lookup: false, }); cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent) +cacheable.install(https.globalAgent); if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); @@ -115,9 +115,7 @@ if (cluster.isMaster) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([ - scrapeQueue.getWaitingCount(), - ]); + const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs @@ -190,38 +188,77 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { - if (err instanceof ZodError) { - res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); - } else { + app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: Response, + next: NextFunction + ) => { + if (err instanceof ZodError) { + res + .status(400) + .json({ success: false, error: "Bad Request", details: err.errors }); + } else { next(err); + } } - }); + ); Sentry.setupExpressErrorHandler(app); - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); - } - } + app.use( + ( + err: unknown, + req: Request<{}, ErrorResponse, undefined>, + res: ResponseWithSentry, + next: NextFunction + ) => { + if ( + err instanceof SyntaxError && + "status" in err && + err.status === 400 && + "body" in err + ) { + return res + .status(400) + .json({ success: false, error: "Bad request, malformed JSON" }); + } - Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); - }); + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error( + "Error occurred in request! (" + + req.path + + ") -- ID " + + id + + " -- " + + verbose + ); + res + .status(500) + .json({ + success: false, + error: + "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + + id, + }); + } + ); Logger.info(`Worker ${process.pid} started`); } - - // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); @@ -230,6 +267,3 @@ if (cluster.isMaster) { // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); - - - From ca9a781eb7fbadf7aee7dd6926aea3a0b1ca5e07 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 12:27:55 -0300 Subject: [PATCH 2/6] Update index.ts --- apps/api/src/index.ts | 106 +++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 1edf3759..7d8817af 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -1,5 +1,5 @@ import "dotenv/config"; -import "./services/sentry"; +import "./services/sentry" import * as Sentry from "@sentry/node"; import express, { NextFunction, Request, Response } from "express"; import bodyParser from "body-parser"; @@ -12,9 +12,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; -import http from "node:http"; -import https from "node:https"; -import CacheableLookup from "cacheable-lookup"; +import http from 'node:http'; +import https from 'node:https'; +import CacheableLookup from 'cacheable-lookup'; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; @@ -31,11 +31,11 @@ Logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup({ // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme - lookup: false, + lookup:false }); cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent); +cacheable.install(https.globalAgent) if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); @@ -115,7 +115,9 @@ if (cluster.isMaster) { app.get(`/serverHealthCheck`, async (req, res) => { try { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]); + const [waitingJobs] = await Promise.all([ + scrapeQueue.getWaitingCount(), + ]); const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs @@ -188,77 +190,42 @@ if (cluster.isMaster) { res.send({ isProduction: global.isProduction }); }); - app.use( - ( - err: unknown, - req: Request<{}, ErrorResponse, undefined>, - res: Response, - next: NextFunction - ) => { - if (err instanceof ZodError) { - res - .status(400) - .json({ success: false, error: "Bad Request", details: err.errors }); - } else { + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { + if (err instanceof ZodError) { + res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); + } else { next(err); - } } - ); + }); Sentry.setupExpressErrorHandler(app); - app.use( - ( - err: unknown, - req: Request<{}, ErrorResponse, undefined>, - res: ResponseWithSentry, - next: NextFunction - ) => { - if ( - err instanceof SyntaxError && - "status" in err && - err.status === 400 && - "body" in err - ) { - return res - .status(400) - .json({ success: false, error: "Bad request, malformed JSON" }); - } - - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); - } - } - - Logger.error( - "Error occurred in request! (" + - req.path + - ") -- ID " + - id + - " -- " + - verbose - ); - res - .status(500) - .json({ - success: false, - error: - "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + - id, - }); + app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { + if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { + return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); } - ); + + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); + } + } + + Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); + }); Logger.info(`Worker ${process.pid} started`); } + + // const sq = getScrapeQueue(); // sq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); @@ -267,3 +234,6 @@ if (cluster.isMaster) { // sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); // sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); // sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); + + + From 17e419a7fb82dacba45692ea676f0487e66d5f70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 9 Sep 2024 21:06:23 -0300 Subject: [PATCH 3/6] Nick: --- .../scraper/WebScraper/scrapers/fireEngine.ts | 2 +- apps/api/src/scraper/WebScraper/single_url.ts | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index e7361c5c..a3f393c8 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -55,7 +55,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); let waitParam = reqParams["params"]?.wait ?? waitFor; - let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; + let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8bafd203..2be65899 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -96,15 +96,15 @@ function getScrapingFallbackOrder( "fetch", ].filter(Boolean); - if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { - defaultOrder = [ - "fire-engine", - useFireEngine ? undefined : "playwright", - ...defaultOrder.filter( - (scraper) => scraper !== "fire-engine" && scraper !== "playwright" - ), - ].filter(Boolean); - } + // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { + // defaultOrder = [ + // "fire-engine", + // useFireEngine ? undefined : "playwright", + // ...defaultOrder.filter( + // (scraper) => scraper !== "fire-engine" && scraper !== "playwright" + // ), + // ].filter(Boolean); + // } const filteredDefaultOrder = defaultOrder.filter( (scraper: (typeof baseScrapers)[number]) => From 26f2095de61103e854ef95326b6e0570b2494879 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 09:24:23 +0200 Subject: [PATCH 4/6] fix(v1): proper Invalid URL handling --- apps/api/src/controllers/v1/types.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 63ec1dd4..f812f981 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -30,7 +30,14 @@ export const url = z.preprocess( "URL must have a valid top-level domain or be a valid path" ) .refine( - (x) => checkUrl(x as string), + (x) => { + try { + checkUrl(x as string) + return true; + } catch (_) { + return false; + } + }, "Invalid URL" ) .refine( From b4dbf7553750a54040ff47fea9042d2858aaa9cd Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Tue, 10 Sep 2024 10:25:14 +0200 Subject: [PATCH 5/6] fix(v1): check if url is string in blocklistMiddleware Fixes FIRECRAWL-SCRAPER-JS-9Z --- apps/api/src/routes/v1.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index daa9bf43..484ab5dc 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -83,7 +83,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) } function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { - if (req.body.url && isUrlBlocked(req.body.url)) { + if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { if (!res.headersSent) { return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); } From a17e1cac929ace616e371b4df4100a1029300609 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 10 Sep 2024 06:53:24 -0300 Subject: [PATCH 6/6] Rate bump --- apps/api/src/services/rate-limiter.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index dade8493..7cfff35b 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -6,7 +6,7 @@ const RATE_LIMITS = { crawl: { default: 3, free: 2, - starter: 3, + starter: 10, standard: 5, standardOld: 40, scale: 50, @@ -19,9 +19,9 @@ const RATE_LIMITS = { scrape: { default: 20, free: 10, - starter: 20, + starter: 100, standard: 100, - standardOld: 40, + standardOld: 100, scale: 500, hobby: 20, standardNew: 100, @@ -32,8 +32,8 @@ const RATE_LIMITS = { search: { default: 20, free: 5, - starter: 20, - standard: 40, + starter: 50, + standard: 50, standardOld: 40, scale: 500, hobby: 10, @@ -45,9 +45,9 @@ const RATE_LIMITS = { map:{ default: 20, free: 5, - starter: 20, - standard: 40, - standardOld: 40, + starter: 50, + standard: 50, + standardOld: 50, scale: 500, hobby: 10, standardNew: 50,