diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ab811067..19bb2d81 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import { z } from "zod"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; -import { ExtractorOptions, PageOptions } from "../../lib/entities"; +import { Action, ExtractorOptions, PageOptions } from "../../lib/entities"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { PlanType } from "../../types"; @@ -57,6 +57,21 @@ export const extractOptions = z.object({ export type ExtractOptions = z.infer<typeof extractOptions>; +export const actionsSchema = z.array(z.union([ + z.object({ + type: z.literal("wait"), + milliseconds: z.number().int().positive().finite(), + }), + z.object({ + type: z.literal("click"), + selector: z.string(), + }), + z.object({ + type: z.literal("screenshot"), + fullPage: z.boolean().default(false), + }), +])); + export const scrapeOptions = z.object({ formats: z .enum([ @@ -80,6 +95,7 @@ export const scrapeOptions = z.object({ waitFor: z.number().int().nonnegative().finite().safe().default(0), extract: extractOptions.optional(), parsePDF: z.boolean().default(true), + actions: actionsSchema.optional(), }).strict(strictMessage) @@ -185,6 +201,9 @@ export type Document = { rawHtml?: string; links?: string[]; screenshot?: string; + actions?: { + screenshots: string[]; + }; metadata: { title?: string; description?: string; @@ -336,6 +355,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { screenshot: x.formats.includes("screenshot"), fullPageScreenshot: x.formats.includes("screenshot@fullPage"), parsePDF: x.parsePDF, + actions: x.actions as Action[], // no strict null checking grrrr - mogery }; } @@ -370,6 +390,7 @@ export function legacyDocumentConverter(doc: any): Document { html: doc.html, extract: doc.llm_extraction, screenshot: 
doc.screenshot ?? doc.fullPageScreenshot, + actions: doc.actions ?? undefined, metadata: { ...doc.metadata, pageError: undefined, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index ef3ee642..cb8e5c56 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -110,6 +110,9 @@ export class Document { childrenLinks?: string[]; provider?: string; warning?: string; + actions?: { + screenshots: string[]; + } index?: number; linksOnPage?: string[]; // Add this new field as a separate property @@ -149,7 +152,7 @@ export class SearchResult { export interface FireEngineResponse { html: string; - screenshot: string; + screenshots?: string[]; pageStatusCode?: number; pageError?: string; } diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 99737d3a..41c79add 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -136,7 +136,7 @@ export async function scrapWithFireEngine({ Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`); logParams.error_message = "Request timed out"; - return { html: "", screenshot: "", pageStatusCode: null, pageError: "" }; + return { html: "", pageStatusCode: null, pageError: "" }; } if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) { @@ -155,7 +155,6 @@ export async function scrapWithFireEngine({ return { html: "", - screenshot: "", pageStatusCode, pageError: checkStatusResponse.data?.pageError ?? 
checkStatusResponse.data?.error, }; @@ -171,7 +170,7 @@ export async function scrapWithFireEngine({ logParams.success = true; logParams.response_code = pageStatusCode; logParams.error_message = pageError; - return { html: content, screenshot: "", pageStatusCode, pageError }; + return { html: content, pageStatusCode, pageError }; } else { const data = checkStatusResponse.data; @@ -183,7 +182,7 @@ export async function scrapWithFireEngine({ logParams.error_message = data.pageError ?? data.error; return { html: data.content ?? "", - screenshot: data.screenshot ?? "", + screenshots: data.screenshots, pageStatusCode: data.pageStatusCode, pageError: data.pageError ?? data.error, }; @@ -196,7 +195,7 @@ export async function scrapWithFireEngine({ Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; } - return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; + return { html: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { const endTime = Date.now(); logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 9d9b7c40..224a8db3 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -69,8 +69,13 @@ function getScrapingFallbackOrder( defaultScraper?: string, isWaitPresent: boolean = false, isScreenshotPresent: boolean = false, - isHeadersPresent: boolean = false + isHeadersPresent: boolean = false, + isActionsPresent: boolean = false, ) { + if (isActionsPresent) { + return useFireEngine ? 
["fire-engine;chrome-cdp"] : []; + } + const availableScrapers = baseScrapers.filter((scraper) => { switch (scraper) { case "scrapingBee": @@ -170,6 +175,9 @@ export async function scrapSingleUrl( let scraperResponse: { text: string; screenshot: string; + actions?: { + screenshots: string[]; + }; metadata: { pageStatusCode?: number; pageError?: string | null }; } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; @@ -217,7 +225,14 @@ export async function scrapSingleUrl( teamId, }); scraperResponse.text = response.html; - scraperResponse.screenshot = response.screenshot; + if (pageOptions.screenshot || pageOptions.fullPageScreenshot) { + scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? ""; + } + if (pageOptions.actions) { + scraperResponse.actions = { + screenshots: response.screenshots ?? [], + }; + } scraperResponse.metadata.pageStatusCode = response.pageStatusCode; scraperResponse.metadata.pageError = response.pageError; } @@ -283,9 +298,6 @@ export async function scrapSingleUrl( ]) : ([]), pageOptions: customScraperResult.pageOptions, }); - if (screenshot) { - customScrapedContent.screenshot = screenshot; - } break; case "pdf": const { content, pageStatusCode, pageError } = @@ -295,7 +307,6 @@ export async function scrapSingleUrl( ); customScrapedContent = { html: content, - screenshot, pageStatusCode, pageError, }; @@ -305,7 +316,6 @@ export async function scrapSingleUrl( if (customScrapedContent) { scraperResponse.text = customScrapedContent.html; - screenshot = customScrapedContent.screenshot; } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); @@ -325,16 +335,18 @@ export async function scrapSingleUrl( html: cleanedHtml, rawHtml: scraperResponse.text, screenshot: scraperResponse.screenshot, + actions: scraperResponse.actions, pageStatusCode: scraperResponse.metadata.pageStatusCode, pageError: 
scraperResponse.metadata.pageError || undefined, }; }; - let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { + let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", + actions: undefined, pageStatusCode: 200, pageError: undefined, }; @@ -350,7 +362,8 @@ export async function scrapSingleUrl( defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), - pageOptions && pageOptions.headers && pageOptions.headers !== undefined + pageOptions && pageOptions.headers && pageOptions.headers !== undefined, + pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0, ); for (const scraper of scrapersInOrder) { @@ -367,6 +380,7 @@ export async function scrapSingleUrl( html = attempt.html ?? ""; rawHtml = attempt.rawHtml ?? ""; screenshot = attempt.screenshot ?? ""; + actions = attempt.actions ?? undefined; if (attempt.pageStatusCode) { pageStatusCode = attempt.pageStatusCode; @@ -404,45 +418,27 @@ export async function scrapSingleUrl( linksOnPage = extractLinks(rawHtml, urlToScrap); } - let document: Document; - if (screenshot && screenshot.length > 0) { - document = { - content: text, - markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, - html: pageOptions.includeHtml ? html : undefined, - rawHtml: - pageOptions.includeRawHtml || - (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) - ? rawHtml - : undefined, - linksOnPage: pageOptions.includeLinks ? 
linksOnPage : undefined, - metadata: { - ...metadata, - screenshot: screenshot, - sourceURL: urlToScrap, - pageStatusCode: pageStatusCode, - pageError: pageError, - }, - }; - } else { - document = { - content: text, - markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, - html: pageOptions.includeHtml ? html : undefined, - rawHtml: - pageOptions.includeRawHtml || - (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) - ? rawHtml - : undefined, - metadata: { - ...metadata, - sourceURL: urlToScrap, - pageStatusCode: pageStatusCode, - pageError: pageError, - }, - linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, - }; - } + let document: Document = { + content: text, + markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, + html: pageOptions.includeHtml ? html : undefined, + rawHtml: + pageOptions.includeRawHtml || + (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) + ? rawHtml + : undefined, + linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, + actions, + metadata: { + ...metadata, + ...(screenshot && screenshot.length > 0 ? ({ + screenshot, + }) : {}), + sourceURL: urlToScrap, + pageStatusCode: pageStatusCode, + pageError: pageError, + }, + }; return document; } catch (error) {