Merge pull request #682 from mendableai/feat/actions

feat: Actions
This commit is contained in:
Nicolas
2024-09-20 18:43:19 -04:00
committed by GitHub
8 changed files with 209 additions and 85 deletions
+11 -3
View File
@@ -285,11 +285,19 @@ export async function scrapeController(req: Request, res: Response) {
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); Logger.error(error);
return res.status(500).json({ if (typeof error === "string" && error.startsWith("{\"type\":\"all\",")) {
error: return res.status(500).json({
success: false,
error: "All scraping methods failed for URL: " + req.body.url,
details: JSON.parse(error).errors as string[],
});
} else {
return res.status(500).json({
error:
typeof error === "string" typeof error === "string"
? error ? error
: error?.message ?? "Internal Server Error", : error?.message ?? "Internal Server Error",
}); });
}
} }
} }
+6 -7
View File
@@ -64,22 +64,21 @@ export async function scrapeController(
success: false, success: false,
error: "Request timed out", error: "Request timed out",
}); });
} else { } else if (typeof e === "string" && e.startsWith("{\"type\":\"all\",")) {
return res.status(500).json({ return res.status(500).json({
success: false, success: false,
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${ error: "All scraping methods failed for URL: " + req.body.url,
extractorOptions && extractorOptions.mode !== "markdown" details: JSON.parse(e).errors as string[],
? " - Could be due to LLM parsing issues"
: ""
}`,
}); });
} else {
throw e;
} }
} }
await job.remove(); await job.remove();
if (!doc) { if (!doc) {
console.error("!!! PANIC DOC IS", doc, job); // console.error("!!! PANIC DOC IS", doc, job);
return res.status(200).json({ return res.status(200).json({
success: true, success: true,
warning: "No page found", warning: "No page found",
+34 -1
View File
@@ -1,7 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { z } from "zod"; import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { ExtractorOptions, PageOptions } from "../../lib/entities"; import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
@@ -57,6 +57,33 @@ export const extractOptions = z.object({
export type ExtractOptions = z.infer<typeof extractOptions>; export type ExtractOptions = z.infer<typeof extractOptions>;
// Individual action variants. Each is an object schema tagged by a literal
// `type` field; the remaining field carries that action's single argument.

/** `wait`: `milliseconds` must be a positive, finite integer. */
const waitActionSchema = z.object({
  type: z.literal("wait"),
  milliseconds: z.number().int().positive().finite(),
});

/** `click`: carries a `selector` string. */
const clickActionSchema = z.object({
  type: z.literal("click"),
  selector: z.string(),
});

/** `screenshot`: `fullPage` defaults to `false` when omitted. */
const screenshotActionSchema = z.object({
  type: z.literal("screenshot"),
  fullPage: z.boolean().default(false),
});

/** `typeText`: carries the `text` string. */
const typeTextActionSchema = z.object({
  type: z.literal("typeText"),
  text: z.string(),
});

/** `pressKey`: carries the `key` string. */
const pressKeyActionSchema = z.object({
  type: z.literal("pressKey"),
  key: z.string(),
});

/** `scroll`: `direction` is restricted to "up" or "down". */
const scrollActionSchema = z.object({
  type: z.literal("scroll"),
  direction: z.enum(["up", "down"]),
});

/**
 * Validates an ordered list of page actions.
 *
 * Every element must match one of the action variants above. The variants
 * are listed in the union in the same order as before: wait, click,
 * screenshot, typeText, pressKey, scroll.
 */
export const actionsSchema = z.array(
  z.union([
    waitActionSchema,
    clickActionSchema,
    screenshotActionSchema,
    typeTextActionSchema,
    pressKeyActionSchema,
    scrollActionSchema,
  ])
);
export const scrapeOptions = z.object({ export const scrapeOptions = z.object({
formats: z formats: z
.enum([ .enum([
@@ -80,6 +107,7 @@ export const scrapeOptions = z.object({
waitFor: z.number().int().nonnegative().finite().safe().default(0), waitFor: z.number().int().nonnegative().finite().safe().default(0),
extract: extractOptions.optional(), extract: extractOptions.optional(),
parsePDF: z.boolean().default(true), parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(),
}).strict(strictMessage) }).strict(strictMessage)
@@ -185,6 +213,9 @@ export type Document = {
rawHtml?: string; rawHtml?: string;
links?: string[]; links?: string[];
screenshot?: string; screenshot?: string;
actions?: {
screenshots: string[];
};
metadata: { metadata: {
title?: string; title?: string;
description?: string; description?: string;
@@ -336,6 +367,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
screenshot: x.formats.includes("screenshot"), screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF, parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
}; };
} }
@@ -370,6 +402,7 @@ export function legacyDocumentConverter(doc: any): Document {
html: doc.html, html: doc.html,
extract: doc.llm_extraction, extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot, screenshot: doc.screenshot ?? doc.fullPageScreenshot,
actions: doc.actions ?? undefined,
metadata: { metadata: {
...doc.metadata, ...doc.metadata,
pageError: undefined, pageError: undefined,
+26 -2
View File
@@ -10,6 +10,26 @@ export interface Progress {
currentDocument?: Document; currentDocument?: Document;
} }
/**
 * A single page action, discriminated by its `type` tag.
 *
 * Each variant carries exactly one argument field in addition to `type`.
 */
export type Action =
  // Pause; `milliseconds` is the duration of the wait.
  | { type: "wait"; milliseconds: number }
  // Click; `selector` is passed along with the action.
  | { type: "click"; selector: string }
  // Capture a screenshot; `fullPage` is optional.
  | { type: "screenshot"; fullPage?: boolean }
  // Type the given `text`.
  | { type: "typeText"; text: string }
  // Press the given `key`.
  | { type: "pressKey"; key: string }
  // Scroll in the given `direction`.
  | { type: "scroll"; direction: "up" | "down" };
export type PageOptions = { export type PageOptions = {
includeMarkdown?: boolean; includeMarkdown?: boolean;
includeExtract?: boolean; includeExtract?: boolean;
@@ -29,7 +49,8 @@ export type PageOptions = {
includeLinks?: boolean; includeLinks?: boolean;
useFastMode?: boolean; // beta useFastMode?: boolean; // beta
disableJsDom?: boolean; // beta disableJsDom?: boolean; // beta
atsv?: boolean; // beta atsv?: boolean; // anti-bot solver, beta
actions?: Action[]; // beta
}; };
export type ExtractorOptions = { export type ExtractorOptions = {
@@ -98,6 +119,9 @@ export class Document {
childrenLinks?: string[]; childrenLinks?: string[];
provider?: string; provider?: string;
warning?: string; warning?: string;
actions?: {
screenshots: string[];
}
index?: number; index?: number;
linksOnPage?: string[]; // Add this new field as a separate property linksOnPage?: string[]; // Add this new field as a separate property
@@ -137,7 +161,7 @@ export class SearchResult {
export interface FireEngineResponse { export interface FireEngineResponse {
html: string; html: string;
screenshot: string; screenshots?: string[];
pageStatusCode?: number; pageStatusCode?: number;
pageError?: string; pageError?: string;
} }
+2 -1
View File
@@ -591,7 +591,8 @@ export class WebScraperDataProvider {
screenshot: options.pageOptions?.screenshot ?? false, screenshot: options.pageOptions?.screenshot ?? false,
useFastMode: options.pageOptions?.useFastMode ?? false, useFastMode: options.pageOptions?.useFastMode ?? false,
disableJsDom: options.pageOptions?.disableJsDom ?? false, disableJsDom: options.pageOptions?.disableJsDom ?? false,
atsv: options.pageOptions?.atsv ?? false atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined,
}; };
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths = this.replaceAllPathsWithAbsolutePaths =
@@ -1,5 +1,5 @@
import axios from "axios"; import axios from "axios";
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities"; import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log"; import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url"; import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { fetchAndProcessPdf } from "../utils/pdfProcessor";
@@ -20,6 +20,7 @@ import * as Sentry from "@sentry/node";
*/ */
export async function scrapWithFireEngine({ export async function scrapWithFireEngine({
url, url,
actions,
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false, fullPageScreenshot = false,
@@ -31,6 +32,7 @@ export async function scrapWithFireEngine({
teamId, teamId,
}: { }: {
url: string; url: string;
actions?: Action[];
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean; fullPageScreenshot?: boolean;
@@ -75,7 +77,7 @@ export async function scrapWithFireEngine({
} }
Logger.info( Logger.info(
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
); );
// atsv is only available for beta customers // atsv is only available for beta customers
@@ -101,10 +103,10 @@ export async function scrapWithFireEngine({
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
{ {
url: url, url: url,
headers: headers,
wait: waitParam, wait: waitParam,
screenshot: screenshotParam, screenshot: screenshotParam,
fullPageScreenshot: fullPageScreenshotParam, fullPageScreenshot: fullPageScreenshotParam,
headers: headers,
disableJsDom: pageOptions?.disableJsDom ?? false, disableJsDom: pageOptions?.disableJsDom ?? false,
priority, priority,
engine, engine,
@@ -112,6 +114,7 @@ export async function scrapWithFireEngine({
...fireEngineOptionsParam, ...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false, atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [], scrollXPaths: pageOptions?.scrollXPaths ?? [],
actions: actions,
}, },
{ {
headers: { headers: {
@@ -125,8 +128,10 @@ export async function scrapWithFireEngine({
); );
}); });
const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal) {
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
} }
@@ -143,12 +148,12 @@ export async function scrapWithFireEngine({
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`); Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" }; return { html: "", pageStatusCode: null, pageError: "" };
} }
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) { if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
Logger.debug( Logger.debug(
`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}` `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
); );
logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error; logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
@@ -162,7 +167,6 @@ export async function scrapWithFireEngine({
return { return {
html: "", html: "",
screenshot: "",
pageStatusCode, pageStatusCode,
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error, pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
}; };
@@ -178,7 +182,7 @@ export async function scrapWithFireEngine({
logParams.success = true; logParams.success = true;
logParams.response_code = pageStatusCode; logParams.response_code = pageStatusCode;
logParams.error_message = pageError; logParams.error_message = pageError;
return { html: content, screenshot: "", pageStatusCode, pageError }; return { html: content, pageStatusCode, pageError };
} else { } else {
const data = checkStatusResponse.data; const data = checkStatusResponse.data;
@@ -190,7 +194,7 @@ export async function scrapWithFireEngine({
logParams.error_message = data.pageError ?? data.error; logParams.error_message = data.pageError ?? data.error;
return { return {
html: data.content ?? "", html: data.content ?? "",
screenshot: data.screenshot ?? "", screenshots: data.screenshots,
pageStatusCode: data.pageStatusCode, pageStatusCode: data.pageStatusCode,
pageError: data.pageError ?? data.error, pageError: data.pageError ?? data.error,
}; };
@@ -203,7 +207,7 @@ export async function scrapWithFireEngine({
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`); Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
logParams.error_message = error.message || error; logParams.error_message = error.message || error;
} }
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; return { html: "", pageStatusCode: null, pageError: logParams.error_message };
} finally { } finally {
const endTime = Date.now(); const endTime = Date.now();
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
+84 -55
View File
@@ -69,8 +69,13 @@ function getScrapingFallbackOrder(
defaultScraper?: string, defaultScraper?: string,
isWaitPresent: boolean = false, isWaitPresent: boolean = false,
isScreenshotPresent: boolean = false, isScreenshotPresent: boolean = false,
isHeadersPresent: boolean = false isHeadersPresent: boolean = false,
isActionsPresent: boolean = false,
) { ) {
if (isActionsPresent) {
return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
}
const availableScrapers = baseScrapers.filter((scraper) => { const availableScrapers = baseScrapers.filter((scraper) => {
switch (scraper) { switch (scraper) {
case "scrapingBee": case "scrapingBee":
@@ -148,7 +153,8 @@ export async function scrapSingleUrl(
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
useFastMode: pageOptions.useFastMode ?? false, useFastMode: pageOptions.useFastMode ?? false,
disableJsDom: pageOptions.disableJsDom ?? false, disableJsDom: pageOptions.disableJsDom ?? false,
atsv: pageOptions.atsv ?? false atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined,
} }
if (extractorOptions) { if (extractorOptions) {
@@ -170,6 +176,9 @@ export async function scrapSingleUrl(
let scraperResponse: { let scraperResponse: {
text: string; text: string;
screenshot: string; screenshot: string;
actions?: {
screenshots: string[];
};
metadata: { pageStatusCode?: number; pageError?: string | null }; metadata: { pageStatusCode?: number; pageError?: string | null };
} = { text: "", screenshot: "", metadata: {} }; } = { text: "", screenshot: "", metadata: {} };
let screenshot = ""; let screenshot = "";
@@ -195,9 +204,23 @@ export async function scrapSingleUrl(
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
const response = await scrapWithFireEngine({ const response = await scrapWithFireEngine({
url, url,
waitFor: pageOptions.waitFor, ...(engine === "chrome-cdp" ? ({
screenshot: pageOptions.screenshot, actions: [
fullPageScreenshot: pageOptions.fullPageScreenshot, ...(pageOptions.waitFor ? [{
type: "wait" as const,
milliseconds: pageOptions.waitFor,
}] : []),
...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
type: "screenshot" as const,
fullPage: !!pageOptions.fullPageScreenshot,
}] : []),
...(pageOptions.actions ?? []),
],
}) : ({
waitFor: pageOptions.waitFor,
screenshot: pageOptions.screenshot,
fullPageScreenshot: pageOptions.fullPageScreenshot,
})),
pageOptions: pageOptions, pageOptions: pageOptions,
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: { fireEngineOptions: {
@@ -209,7 +232,14 @@ export async function scrapSingleUrl(
teamId, teamId,
}); });
scraperResponse.text = response.html; scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot; if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
}
if (pageOptions.actions) {
scraperResponse.actions = {
screenshots: response.screenshots ?? [],
};
}
scraperResponse.metadata.pageStatusCode = response.pageStatusCode; scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
scraperResponse.metadata.pageError = response.pageError; scraperResponse.metadata.pageError = response.pageError;
} }
@@ -267,13 +297,14 @@ export async function scrapSingleUrl(
case "fire-engine": case "fire-engine":
customScrapedContent = await scrapWithFireEngine({ customScrapedContent = await scrapWithFireEngine({
url: customScraperResult.url, url: customScraperResult.url,
waitFor: customScraperResult.waitAfterLoad, actions: customScraperResult.waitAfterLoad ? ([
screenshot: false, {
type: "wait",
milliseconds: customScraperResult.waitAfterLoad,
}
]) : ([]),
pageOptions: customScraperResult.pageOptions, pageOptions: customScraperResult.pageOptions,
}); });
if (screenshot) {
customScrapedContent.screenshot = screenshot;
}
break; break;
case "pdf": case "pdf":
const { content, pageStatusCode, pageError } = const { content, pageStatusCode, pageError } =
@@ -283,7 +314,6 @@ export async function scrapSingleUrl(
); );
customScrapedContent = { customScrapedContent = {
html: content, html: content,
screenshot,
pageStatusCode, pageStatusCode,
pageError, pageError,
}; };
@@ -293,7 +323,6 @@ export async function scrapSingleUrl(
if (customScrapedContent) { if (customScrapedContent) {
scraperResponse.text = customScrapedContent.html; scraperResponse.text = customScrapedContent.html;
screenshot = customScrapedContent.screenshot;
} }
//* TODO: add an optional to return markdown or structured/extracted content //* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
@@ -313,19 +342,24 @@ export async function scrapSingleUrl(
html: cleanedHtml, html: cleanedHtml,
rawHtml: scraperResponse.text, rawHtml: scraperResponse.text,
screenshot: scraperResponse.screenshot, screenshot: scraperResponse.screenshot,
actions: scraperResponse.actions,
pageStatusCode: scraperResponse.metadata.pageStatusCode, pageStatusCode: scraperResponse.metadata.pageStatusCode,
pageError: scraperResponse.metadata.pageError || undefined, pageError: scraperResponse.metadata.pageError || undefined,
}; };
}; };
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
text: "", text: "",
html: "", html: "",
rawHtml: "", rawHtml: "",
screenshot: "", screenshot: "",
actions: undefined,
pageStatusCode: 200, pageStatusCode: 200,
pageError: undefined, pageError: undefined,
}; };
const errors: Record<string, string> = {};
try { try {
let urlKey = urlToScrap; let urlKey = urlToScrap;
try { try {
@@ -338,7 +372,8 @@ export async function scrapSingleUrl(
defaultScraper, defaultScraper,
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
pageOptions && pageOptions.headers && pageOptions.headers !== undefined pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
); );
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
@@ -355,6 +390,7 @@ export async function scrapSingleUrl(
html = attempt.html ?? ""; html = attempt.html ?? "";
rawHtml = attempt.rawHtml ?? ""; rawHtml = attempt.rawHtml ?? "";
screenshot = attempt.screenshot ?? ""; screenshot = attempt.screenshot ?? "";
actions = attempt.actions ?? undefined;
if (attempt.pageStatusCode) { if (attempt.pageStatusCode) {
pageStatusCode = attempt.pageStatusCode; pageStatusCode = attempt.pageStatusCode;
@@ -365,6 +401,12 @@ export async function scrapSingleUrl(
pageError = undefined; pageError = undefined;
} }
if (attempt.pageError) {
errors[scraper] = attempt.pageError;
} else {
errors[scraper] = null;
}
if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
break; break;
@@ -392,54 +434,41 @@ export async function scrapSingleUrl(
linksOnPage = extractLinks(rawHtml, urlToScrap); linksOnPage = extractLinks(rawHtml, urlToScrap);
} }
let document: Document; let document: Document = {
if (screenshot && screenshot.length > 0) { content: text,
document = { markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
content: text, html: pageOptions.includeHtml ? html : undefined,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, rawHtml:
html: pageOptions.includeHtml ? html : undefined, pageOptions.includeRawHtml ||
rawHtml: (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
pageOptions.includeRawHtml || ? rawHtml
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) : undefined,
? rawHtml linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
: undefined, actions,
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, metadata: {
metadata: { ...metadata,
...metadata, ...(screenshot && screenshot.length > 0 ? ({
screenshot: screenshot, screenshot,
sourceURL: urlToScrap, }) : {}),
pageStatusCode: pageStatusCode, sourceURL: urlToScrap,
pageError: pageError, pageStatusCode: pageStatusCode,
}, pageError: pageError,
}; },
} else { };
document = {
content: text,
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
? rawHtml
: undefined,
metadata: {
...metadata,
sourceURL: urlToScrap,
pageStatusCode: pageStatusCode,
pageError: pageError,
},
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
};
}
return document; return document;
} catch (error) { } catch (error) {
Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); Logger.error(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
ScrapeEvents.insert(jobId, { ScrapeEvents.insert(jobId, {
type: "error", type: "error",
message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error), message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
stack: error.stack, stack: error.stack,
}); });
if (error instanceof Error && error.message.startsWith("All scraping methods failed")) {
throw new Error(JSON.stringify({"type": "all", "errors": Object.values(errors)}));
}
return { return {
content: "", content: "",
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined, markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
+32 -6
View File
@@ -58,7 +58,7 @@ export interface FirecrawlDocumentMetadata {
* Document interface for Firecrawl. * Document interface for Firecrawl.
* Represents a document retrieved or processed by Firecrawl. * Represents a document retrieved or processed by Firecrawl.
*/ */
export interface FirecrawlDocument<T = any> { export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult | never) = never> {
url?: string; url?: string;
markdown?: string; markdown?: string;
html?: string; html?: string;
@@ -67,6 +67,7 @@ export interface FirecrawlDocument<T = any> {
extract?: T; extract?: T;
screenshot?: string; screenshot?: string;
metadata?: FirecrawlDocumentMetadata; metadata?: FirecrawlDocumentMetadata;
actions: ActionsSchema;
} }
/** /**
@@ -83,19 +84,44 @@ export interface CrawlScrapeOptions {
timeout?: number; timeout?: number;
} }
export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any> extends CrawlScrapeOptions { export type Action = {
type: "wait",
milliseconds: number,
} | {
type: "click",
selector: string,
} | {
type: "screenshot",
fullPage?: boolean,
} | {
type: "typeText",
text: string,
} | {
type: "pressKey",
key: string,
} | {
type: "scroll",
direction: "up" | "down",
};
export interface ScrapeParams<LLMSchema extends zt.ZodSchema = any, ActionsSchema extends (Action[] | undefined) = undefined> extends CrawlScrapeOptions {
extract?: { extract?: {
prompt?: string; prompt?: string;
schema?: LLMSchema; schema?: LLMSchema;
systemPrompt?: string; systemPrompt?: string;
}; };
actions?: ActionsSchema;
}
/**
 * Result payload associated with the actions of a scrape request.
 */
export interface ActionsResult {
  /**
   * Screenshot entries, one string per screenshot.
   * NOTE(review): the string encoding (URL vs. inline data) is not visible
   * here — confirm against the API before documenting it.
   */
  screenshots: Array<string>;
}
/** /**
* Response interface for scraping operations. * Response interface for scraping operations.
* Defines the structure of the response received after a scraping operation. * Defines the structure of the response received after a scraping operation.
*/ */
export interface ScrapeResponse<LLMResult = any> extends FirecrawlDocument<LLMResult> { export interface ScrapeResponse<LLMResult = any, ActionsSchema extends (ActionsResult | never) = never> extends FirecrawlDocument<LLMResult, ActionsSchema> {
success: true; success: true;
warning?: string; warning?: string;
error?: string; error?: string;
@@ -200,10 +226,10 @@ export default class FirecrawlApp {
* @param params - Additional parameters for the scrape request. * @param params - Additional parameters for the scrape request.
* @returns The response from the scrape operation. * @returns The response from the scrape operation.
*/ */
async scrapeUrl<T extends zt.ZodSchema>( async scrapeUrl<T extends zt.ZodSchema, ActionsSchema extends (Action[] | undefined) = undefined>(
url: string, url: string,
params?: ScrapeParams<T> params?: ScrapeParams<T, ActionsSchema>
): Promise<ScrapeResponse<zt.infer<T>> | ErrorResponse> { ): Promise<ScrapeResponse<zt.infer<T>, ActionsSchema extends Action[] ? ActionsResult : never> | ErrorResponse> {
const headers: AxiosRequestHeaders = { const headers: AxiosRequestHeaders = {
"Content-Type": "application/json", "Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`, Authorization: `Bearer ${this.apiKey}`,