From 42d677fe3c1f75f41dee344f38a2b4d80ec99cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 18 Sep 2024 20:04:54 +0200 Subject: [PATCH 01/10] feat(fire-engine): port waitFor and screenshot to use actions --- apps/api/src/lib/entities.ts | 14 ++++++++++- .../scraper/WebScraper/scrapers/fireEngine.ts | 23 +++++++------------ apps/api/src/scraper/WebScraper/single_url.ts | 22 ++++++++++++++---- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index d7ec2a83..ef3ee642 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -10,6 +10,17 @@ export interface Progress { currentDocument?: Document; } +export type Action = { + type: "wait", + milliseconds: number, +} | { + type: "click", + selector: string, +} | { + type: "screenshot", + fullPage?: boolean, +}; + export type PageOptions = { includeMarkdown?: boolean; includeExtract?: boolean; @@ -29,7 +40,8 @@ export type PageOptions = { includeLinks?: boolean; useFastMode?: boolean; // beta disableJsDom?: boolean; // beta - atsv?: boolean; // beta + atsv?: boolean; // anti-bot solver, beta + actions?: Action[]; // beta }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 80ac7924..99737d3a 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -1,5 +1,5 @@ import axios from "axios"; -import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities"; +import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities"; import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; @@ -20,9 +20,7 @@ import * as Sentry from "@sentry/node"; */ export async function scrapWithFireEngine({ url, - waitFor = 0, - screenshot = false, - fullPageScreenshot = false, + actions, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false }, fireEngineOptions = {}, headers, @@ -31,9 +29,7 @@ export async function scrapWithFireEngine({ teamId, }: { url: string; - waitFor?: number; - screenshot?: boolean; - fullPageScreenshot?: boolean; + actions?: Action[]; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; @@ -54,10 +50,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); - let waitParam = reqParams["params"]?.wait ?? waitFor; let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; - let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; - let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -75,7 +68,7 @@ export async function scrapWithFireEngine({ } Logger.info( - `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }` ); // atsv is only available for beta customers @@ -101,9 +94,6 @@ export async function scrapWithFireEngine({ process.env.FIRE_ENGINE_BETA_URL + endpoint, { url: url, - wait: waitParam, - screenshot: screenshotParam, - fullPageScreenshot: fullPageScreenshotParam, headers: headers, disableJsDom: pageOptions?.disableJsDom ?? false, priority, @@ -112,6 +102,7 @@ export async function scrapWithFireEngine({ ...fireEngineOptionsParam, atsv: pageOptions?.atsv ?? false, scrollXPaths: pageOptions?.scrollXPaths ?? [], + actions: actions, }, { headers: { @@ -125,8 +116,10 @@ export async function scrapWithFireEngine({ ); }); + const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => x.milliseconds + a, 0); + let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); - while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { + while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) { await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8143bab0..9d9b7c40 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -195,9 +195,17 @@ export async function scrapSingleUrl( if (process.env.FIRE_ENGINE_BETA_URL) { const response = await scrapWithFireEngine({ url, - waitFor: pageOptions.waitFor, - screenshot: pageOptions.screenshot, - fullPageScreenshot: pageOptions.fullPageScreenshot, + actions: [ + ...(pageOptions.waitFor ? [{ + type: "wait" as const, + milliseconds: pageOptions.waitFor, + }] : []), + ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{ + type: "screenshot" as const, + fullPage: !!pageOptions.fullPageScreenshot, + }] : []), + ...(pageOptions.actions ?? []), + ], pageOptions: pageOptions, headers: pageOptions.headers, fireEngineOptions: { @@ -267,8 +275,12 @@ export async function scrapSingleUrl( case "fire-engine": customScrapedContent = await scrapWithFireEngine({ url: customScraperResult.url, - waitFor: customScraperResult.waitAfterLoad, - screenshot: false, + actions: customScraperResult.waitAfterLoad ? ([ + { + type: "wait", + milliseconds: customScraperResult.waitAfterLoad, + } + ]) : ([]), pageOptions: customScraperResult.pageOptions, }); if (screenshot) { From 093c064bff5c4665097217984bb90bcfc6c7a98d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 18 Sep 2024 20:39:25 +0200 Subject: [PATCH 02/10] feat(v1): add public actions api --- apps/api/src/controllers/v1/types.ts | 23 ++++- apps/api/src/lib/entities.ts | 5 +- .../scraper/WebScraper/scrapers/fireEngine.ts | 9 +- apps/api/src/scraper/WebScraper/single_url.ts | 92 +++++++++---------- 4 files changed, 74 insertions(+), 55 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ab811067..19bb2d81 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import { z } from "zod"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; -import { ExtractorOptions, PageOptions } from "../../lib/entities"; +import { Action, ExtractorOptions, PageOptions } from "../../lib/entities"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { PlanType } from "../../types"; @@ -57,6 +57,21 @@ export const extractOptions = z.object({ export type ExtractOptions = z.infer; +export const actionsSchema = z.array(z.union([ + z.object({ + type: z.literal("wait"), + milliseconds: z.number().int().positive().finite(), + }), + z.object({ + type: z.literal("click"), + selector: z.string(), + }), + z.object({ + type: z.literal("screenshot"), + fullPage: z.boolean().default(false), + }), +])); + export const scrapeOptions = z.object({ formats: z .enum([ @@ -80,6 +95,7 @@ export const scrapeOptions = z.object({ waitFor: z.number().int().nonnegative().finite().safe().default(0), extract: extractOptions.optional(), parsePDF: z.boolean().default(true), + actions: actionsSchema.optional(), }).strict(strictMessage) @@ -185,6 +201,9 @@ export type Document = { rawHtml?: string; links?: string[]; screenshot?: string; + actions?: { + screenshots: string[]; + }; metadata: { title?: string; description?: string; @@ -336,6 +355,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { screenshot: x.formats.includes("screenshot"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"), parsePDF: x.parsePDF, + actions: x.actions as Action[], // no strict null checking grrrr - mogery }; } @@ -370,6 +390,7 @@ export function legacyDocumentConverter(doc: any): Document { html: doc.html, extract: doc.llm_extraction, screenshot: doc.screenshot ?? doc.fullPageScreenshot, + actions: doc.actions ?? undefined, metadata: { ...doc.metadata, pageError: undefined, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index ef3ee642..cb8e5c56 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -110,6 +110,9 @@ export class Document { childrenLinks?: string[]; provider?: string; warning?: string; + actions?: { + screenshots: string[]; + } index?: number; linksOnPage?: string[]; // Add this new field as a separate property @@ -149,7 +152,7 @@ export class SearchResult { export interface FireEngineResponse { html: string; - screenshot: string; + screenshots?: string[]; pageStatusCode?: number; pageError?: string; } diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 99737d3a..41c79add 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -136,7 +136,7 @@ export async function scrapWithFireEngine({ Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`); logParams.error_message = "Request timed out"; - return { html: "", screenshot: "", pageStatusCode: null, pageError: "" }; + return { html: "", pageStatusCode: null, pageError: "" }; } if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) { @@ -155,7 +155,6 @@ export async function scrapWithFireEngine({ return { html: "", - screenshot: "", pageStatusCode, pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error, }; @@ -171,7 +170,7 @@ export async function scrapWithFireEngine({ logParams.success = true; logParams.response_code = pageStatusCode; logParams.error_message = pageError; - return { html: content, screenshot: "", pageStatusCode, pageError }; + return { html: content, pageStatusCode, pageError }; } else { const data = checkStatusResponse.data; @@ -183,7 +182,7 @@ export async function scrapWithFireEngine({ logParams.error_message = data.pageError ?? data.error; return { html: data.content ?? "", - screenshot: data.screenshot ?? "", + screenshots: data.screenshots, pageStatusCode: data.pageStatusCode, pageError: data.pageError ?? data.error, }; @@ -196,7 +195,7 @@ export async function scrapWithFireEngine({ Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; } - return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; + return { html: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { const endTime = Date.now(); logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9d9b7c40..224a8db3 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -69,8 +69,13 @@ function getScrapingFallbackOrder( defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false, - isHeadersPresent: boolean = false + isHeadersPresent: boolean = false, + isActionsPresent: boolean = false, ) { + if (isActionsPresent) { + return useFireEngine ? ["fire-engine;chrome-cdp"] : []; + } + const availableScrapers = baseScrapers.filter((scraper) => { switch (scraper) { case "scrapingBee": @@ -170,6 +175,9 @@ export async function scrapSingleUrl( let scraperResponse: { text: string; screenshot: string; + actions?: { + screenshots: string[]; + }; metadata: { pageStatusCode?: number; pageError?: string | null }; } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; @@ -217,7 +225,14 @@ export async function scrapSingleUrl( teamId, }); scraperResponse.text = response.html; - scraperResponse.screenshot = response.screenshot; + if (pageOptions.screenshot || pageOptions.fullPageScreenshot) { + scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? ""; + } + if (pageOptions.actions) { + scraperResponse.actions = { + screenshots: response.screenshots ?? [], + }; + } scraperResponse.metadata.pageStatusCode = response.pageStatusCode; scraperResponse.metadata.pageError = response.pageError; } @@ -283,9 +298,6 @@ export async function scrapSingleUrl( ]) : ([]), pageOptions: customScraperResult.pageOptions, }); - if (screenshot) { - customScrapedContent.screenshot = screenshot; - } break; case "pdf": const { content, pageStatusCode, pageError } = @@ -295,7 +307,6 @@ export async function scrapSingleUrl( ); customScrapedContent = { html: content, - screenshot, pageStatusCode, pageError, }; @@ -305,7 +316,6 @@ export async function scrapSingleUrl( if (customScrapedContent) { scraperResponse.text = customScrapedContent.html; - screenshot = customScrapedContent.screenshot; } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); @@ -325,16 +335,18 @@ export async function scrapSingleUrl( html: cleanedHtml, rawHtml: scraperResponse.text, screenshot: scraperResponse.screenshot, + actions: scraperResponse.actions, pageStatusCode: scraperResponse.metadata.pageStatusCode, pageError: scraperResponse.metadata.pageError || undefined, }; }; - let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { + let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", + actions: undefined, pageStatusCode: 200, pageError: undefined, }; @@ -350,7 +362,8 @@ export async function scrapSingleUrl( defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), - pageOptions && pageOptions.headers && pageOptions.headers !== undefined + pageOptions && pageOptions.headers && pageOptions.headers !== undefined, + pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0, ); for (const scraper of scrapersInOrder) { @@ -367,6 +380,7 @@ export async function scrapSingleUrl( html = attempt.html ?? ""; rawHtml = attempt.rawHtml ?? ""; screenshot = attempt.screenshot ?? ""; + actions = attempt.actions ?? undefined; if (attempt.pageStatusCode) { pageStatusCode = attempt.pageStatusCode; @@ -404,45 +418,27 @@ export async function scrapSingleUrl( linksOnPage = extractLinks(rawHtml, urlToScrap); } - let document: Document; - if (screenshot && screenshot.length > 0) { - document = { - content: text, - markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, - html: pageOptions.includeHtml ? html : undefined, - rawHtml: - pageOptions.includeRawHtml || - (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) - ? rawHtml - : undefined, - linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, - metadata: { - ...metadata, - screenshot: screenshot, - sourceURL: urlToScrap, - pageStatusCode: pageStatusCode, - pageError: pageError, - }, - }; - } else { - document = { - content: text, - markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, - html: pageOptions.includeHtml ? html : undefined, - rawHtml: - pageOptions.includeRawHtml || - (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) - ? rawHtml - : undefined, - metadata: { - ...metadata, - sourceURL: urlToScrap, - pageStatusCode: pageStatusCode, - pageError: pageError, - }, - linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, - }; - } + let document: Document = { + content: text, + markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, + html: pageOptions.includeHtml ? html : undefined, + rawHtml: + pageOptions.includeRawHtml || + (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) + ? rawHtml + : undefined, + linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, + actions, + metadata: { + ...metadata, + ...(screenshot && screenshot.length > 0 ? ({ + screenshot, + }) : {}), + sourceURL: urlToScrap, + pageStatusCode: pageStatusCode, + pageError: pageError, + }, + }; return document; } catch (error) { From 20d1855ad5b81af9ca394ebac82c9eb13c17ed0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 18 Sep 2024 20:47:56 +0200 Subject: [PATCH 03/10] feat(js-sdk): actions integration --- apps/js-sdk/firecrawl/src/index.ts | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 6c859bee..4c30d5fd 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -58,7 +58,7 @@ export interface FirecrawlDocumentMetadata { * Document interface for Firecrawl. * Represents a document retrieved or processed by Firecrawl. */ -export interface FirecrawlDocument { +export interface FirecrawlDocument { url?: string; markdown?: string; html?: string; @@ -67,6 +67,7 @@ export interface FirecrawlDocument { extract?: T; screenshot?: string; metadata?: FirecrawlDocumentMetadata; + actions: ActionsSchema; } /** @@ -83,19 +84,35 @@ export interface CrawlScrapeOptions { timeout?: number; } -export interface ScrapeParams extends CrawlScrapeOptions { +export type Action = { + type: "wait", + milliseconds: number, +} | { + type: "click", + selector: string, +} | { + type: "screenshot", + fullPage?: boolean, +}; + +export interface ScrapeParams extends CrawlScrapeOptions { extract?: { prompt?: string; schema?: LLMSchema; systemPrompt?: string; }; + actions?: ActionsSchema; +} + +export interface ActionsResult { + screenshots: string[]; } /** * Response interface for scraping operations. * Defines the structure of the response received after a scraping operation. */ -export interface ScrapeResponse extends FirecrawlDocument { +export interface ScrapeResponse extends FirecrawlDocument { success: true; warning?: string; error?: string; @@ -200,10 +217,10 @@ export default class FirecrawlApp { * @param params - Additional parameters for the scrape request. * @returns The response from the scrape operation. */ - async scrapeUrl( + async scrapeUrl( url: string, - params?: ScrapeParams - ): Promise> | ErrorResponse> { + params?: ScrapeParams + ): Promise, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse> { const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, From 712ca316150b24f63417a7033a7409feed3872bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 18 Sep 2024 21:34:09 +0200 Subject: [PATCH 04/10] minor fixes --- apps/api/src/scraper/WebScraper/index.ts | 3 ++- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 4 ++-- apps/api/src/scraper/WebScraper/single_url.ts | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 2f7efa47..c564c471 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -591,7 +591,8 @@ export class WebScraperDataProvider { screenshot: options.pageOptions?.screenshot ?? false, useFastMode: options.pageOptions?.useFastMode ?? false, disableJsDom: options.pageOptions?.disableJsDom ?? false, - atsv: options.pageOptions?.atsv ?? false + atsv: options.pageOptions?.atsv ?? false, + actions: options.pageOptions?.actions ?? undefined, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 41c79add..deca5498 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -116,7 +116,7 @@ export async function scrapWithFireEngine({ ); }); - const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => x.milliseconds + a, 0); + const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0); let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) { @@ -141,7 +141,7 @@ export async function scrapWithFireEngine({ if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) { Logger.debug( - `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}` + `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}` ); logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 224a8db3..d61fb828 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -153,7 +153,8 @@ export async function scrapSingleUrl( onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], useFastMode: pageOptions.useFastMode ?? false, disableJsDom: pageOptions.disableJsDom ?? false, - atsv: pageOptions.atsv ?? false + atsv: pageOptions.atsv ?? false, + actions: pageOptions.actions ?? undefined, } if (extractorOptions) { From 01f42b980dba6d631731ce68ac09f6d009a3dd62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 19 Sep 2024 19:21:13 +0200 Subject: [PATCH 05/10] feat(scrape): add error tallying instead of empty response --- apps/api/src/controllers/v1/scrape.ts | 13 ++++++------- apps/api/src/scraper/WebScraper/single_url.ts | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index f0744c22..6d006bce 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -64,22 +64,21 @@ export async function scrapeController( success: false, error: "Request timed out", }); - } else { + } else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) { return res.status(500).json({ success: false, - error: `(Internal server error) - ${e && e?.message ? e.message : e} ${ - extractorOptions && extractorOptions.mode !== "markdown" - ? " - Could be due to LLM parsing issues" - : "" - }`, + error: "All scraping methods failed for URL: " + req.body.url, + details: JSON.parse(e).errors as string[], }); + } else { + throw e; } } await job.remove(); if (!doc) { - console.error("!!! PANIC DOC IS", doc, job); + // console.error("!!! PANIC DOC IS", doc, job); return res.status(200).json({ success: true, warning: "No page found", diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d61fb828..0a3adf5c 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -351,6 +351,9 @@ export async function scrapSingleUrl( pageStatusCode: 200, pageError: undefined, }; + + const errors: Record = {}; + try { let urlKey = urlToScrap; try { @@ -392,6 +395,12 @@ export async function scrapSingleUrl( pageError = undefined; } + if (attempt.pageError) { + errors[scraper] = attempt.pageError; + } else { + errors[scraper] = null; + } + if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); break; @@ -443,12 +452,17 @@ export async function scrapSingleUrl( return document; } catch (error) { - Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); + Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); ScrapeEvents.insert(jobId, { type: "error", message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error), stack: error.stack, }); + + if (error instanceof Error && error.message.startsWith("All scraping methods failed")) { + throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)})); + } + return { content: "", markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined, From 3dd912ec91c5f4b2d5610ad3577176e691890e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 20 Sep 2024 21:02:53 +0200 Subject: [PATCH 06/10] feat(actions): add typeText, pressKey, fix playwright screenshot/waitFor --- apps/api/src/controllers/v1/types.ts | 8 ++++++ apps/api/src/lib/entities.ts | 6 ++++ .../scraper/WebScraper/scrapers/fireEngine.ts | 12 ++++++++ apps/api/src/scraper/WebScraper/single_url.ts | 28 +++++++++++-------- apps/js-sdk/firecrawl/src/index.ts | 8 +++++- 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 19bb2d81..67d66742 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -70,6 +70,14 @@ export const actionsSchema = z.array(z.union([ type: z.literal("screenshot"), fullPage: z.boolean().default(false), }), + z.object({ + type: z.literal("typeText"), + text: z.string(), + }), + z.object({ + type: z.literal("pressKey"), + key: z.string(), + }), ])); export const scrapeOptions = z.object({ diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index cb8e5c56..1186583a 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -19,6 +19,12 @@ export type Action = { } | { type: "screenshot", fullPage?: boolean, +} | { + type: "typeText", + text: string, +} | { + type: "pressKey", + key: string, }; export type PageOptions = { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index deca5498..ba747d07 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -21,6 +21,9 @@ import * as Sentry from "@sentry/node"; export async function scrapWithFireEngine({ url, actions, + waitFor = 0, + screenshot = false, + fullPageScreenshot = false, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false }, fireEngineOptions = {}, headers, @@ -30,6 +33,9 @@ export async function scrapWithFireEngine({ }: { url: string; actions?: Action[]; + waitFor?: number; + screenshot?: boolean; + fullPageScreenshot?: boolean; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; @@ -50,7 +56,10 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); + let waitParam = reqParams["params"]?.wait ?? waitFor; let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; + let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -95,6 +104,9 @@ export async function scrapWithFireEngine({ { url: url, headers: headers, + wait: waitParam, + screenshot: screenshotParam, + fullPageScreenshot: fullPageScreenshotParam, disableJsDom: pageOptions?.disableJsDom ?? false, priority, engine, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0a3adf5c..93fb0ce7 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -204,17 +204,23 @@ export async function scrapSingleUrl( if (process.env.FIRE_ENGINE_BETA_URL) { const response = await scrapWithFireEngine({ url, - actions: [ - ...(pageOptions.waitFor ? [{ - type: "wait" as const, - milliseconds: pageOptions.waitFor, - }] : []), - ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{ - type: "screenshot" as const, - fullPage: !!pageOptions.fullPageScreenshot, - }] : []), - ...(pageOptions.actions ?? []), - ], + ...(engine === "chrome-cdp" ? ({ + actions: [ + ...(pageOptions.waitFor ? [{ + type: "wait" as const, + milliseconds: pageOptions.waitFor, + }] : []), + ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{ + type: "screenshot" as const, + fullPage: !!pageOptions.fullPageScreenshot, + }] : []), + ...(pageOptions.actions ?? []), + ], + }) : ({ + waitFor: pageOptions.waitFor, + screenshot: pageOptions.screenshot, + fullPageScreenshot: pageOptions.fullPageScreenshot, + })), pageOptions: pageOptions, headers: pageOptions.headers, fireEngineOptions: { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 4c30d5fd..23241a5d 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -93,7 +93,13 @@ export type Action = { } | { type: "screenshot", fullPage?: boolean, -}; +} | { + type: "typeText", + text: string, +} | { + type: "pressKey", + key: string, +};; export interface ScrapeParams extends CrawlScrapeOptions { extract?: { From d663bbf0cacfcd3bf0ba8757279fc3cd61494e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 20 Sep 2024 21:41:53 +0200 Subject: [PATCH 07/10] feat(actions): add scroll --- apps/api/src/controllers/v1/types.ts | 4 ++++ apps/api/src/lib/entities.ts | 3 +++ apps/js-sdk/firecrawl/src/index.ts | 5 ++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 67d66742..b7ca551b 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -78,6 +78,10 @@ export const actionsSchema = z.array(z.union([ type: z.literal("pressKey"), key: z.string(), }), + z.object({ + type: z.literal("scroll"), + direction: z.enum(["up", "down"]), + }), ])); export const scrapeOptions = z.object({ diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 1186583a..7723f0f2 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -25,6 +25,9 @@ export type Action = { } | { type: "pressKey", key: string, +} | { + type: "scroll", + direction: "up" | "down" }; export type PageOptions = { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 23241a5d..7004d86f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -99,7 +99,10 @@ export type Action = { } | { type: "pressKey", key: string, -};; +} | { + type: "scroll", + direction: "up" | "down", +}; export interface ScrapeParams extends CrawlScrapeOptions { extract?: { From 815bfc8f07b3493ed91ff90afda123e7d84f44f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 20 Sep 2024 21:42:09 +0200 Subject: [PATCH 08/10] feat(scrape): scroll down/up with actions if fullpagescreenshot revert this if unneeded --- apps/api/src/scraper/WebScraper/single_url.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 93fb0ce7..fc3a0efb 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -210,10 +210,26 @@ export async function scrapSingleUrl( type: "wait" as const, milliseconds: pageOptions.waitFor, }] : []), + ...(pageOptions.fullPageScreenshot ? [ + { + type: "scroll" as const, + direction: "down" as const, + }, + { + type: "wait" as const, + milliseconds: 300, + }, + ] : []), ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{ type: "screenshot" as const, fullPage: !!pageOptions.fullPageScreenshot, }] : []), + ...(pageOptions.fullPageScreenshot ? [ + { + type: "scroll" as const, + direction: "up" as const, + } + ] : []), ...(pageOptions.actions ?? []), ], }) : ({ From e1a34b0a99521f19356f6e13d304e45be56c69fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 20 Sep 2024 21:43:22 +0200 Subject: [PATCH 09/10] Revert "feat(scrape): scroll down/up with actions if fullpagescreenshot" This reverts commit 815bfc8f07b3493ed91ff90afda123e7d84f44f9. --- apps/api/src/scraper/WebScraper/single_url.ts | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fc3a0efb..93fb0ce7 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -210,26 +210,10 @@ export async function scrapSingleUrl( type: "wait" as const, milliseconds: pageOptions.waitFor, }] : []), - ...(pageOptions.fullPageScreenshot ? [ - { - type: "scroll" as const, - direction: "down" as const, - }, - { - type: "wait" as const, - milliseconds: 300, - }, - ] : []), ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{ type: "screenshot" as const, fullPage: !!pageOptions.fullPageScreenshot, }] : []), - ...(pageOptions.fullPageScreenshot ? [ - { - type: "scroll" as const, - direction: "up" as const, - } - ] : []), ...(pageOptions.actions ?? []), ], }) : ({ From 3fc5ce17d2739ff9ffaf509315c6c9b5ba1a794b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 20 Sep 2024 18:35:30 -0400 Subject: [PATCH 10/10] Nick: fixed error handling for v0 scrape --- apps/api/src/controllers/v0/scrape.ts | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index c46ebc62..de6cf032 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -285,11 +285,19 @@ export async function scrapeController(req: Request, res: Response) { } catch (error) { Sentry.captureException(error); Logger.error(error); - return res.status(500).json({ - error: + if (typeof error === "string" && error.startsWith("{\"type\":\"all\",")) { + return res.status(500).json({ + success: false, + error: "All scraping methods failed for URL: " + req.body.url, + details: JSON.parse(error).errors as string[], + }); + } else { + return res.status(500).json({ + error: typeof error === "string" ? error : error?.message ?? "Internal Server Error", - }); + }); + } } }