feat(v1): add public actions api
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||||
import { ExtractorOptions, PageOptions } from "../../lib/entities";
|
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
|
|
||||||
@@ -57,6 +57,21 @@ export const extractOptions = z.object({
|
|||||||
|
|
||||||
export type ExtractOptions = z.infer<typeof extractOptions>;
|
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||||
|
|
||||||
|
// Zod schema for a list of actions. Each array element must match one of the
// three object shapes below, which are told apart by their literal `type`.
// It is used as the optional `actions` field of `scrapeOptions`.
export const actionsSchema = z.array(z.union([
|
||||||
|
// "wait" action: `milliseconds` must be a positive, finite integer.
z.object({
|
||||||
|
type: z.literal("wait"),
|
||||||
|
milliseconds: z.number().int().positive().finite(),
|
||||||
|
}),
|
||||||
|
// "click" action: `selector` is any string; no further validation happens here.
z.object({
|
||||||
|
type: z.literal("click"),
|
||||||
|
selector: z.string(),
|
||||||
|
}),
|
||||||
|
// "screenshot" action: `fullPage` falls back to false when omitted.
z.object({
|
||||||
|
type: z.literal("screenshot"),
|
||||||
|
fullPage: z.boolean().default(false),
|
||||||
|
}),
|
||||||
|
]));
|
||||||
|
|
||||||
export const scrapeOptions = z.object({
|
export const scrapeOptions = z.object({
|
||||||
formats: z
|
formats: z
|
||||||
.enum([
|
.enum([
|
||||||
@@ -80,6 +95,7 @@ export const scrapeOptions = z.object({
|
|||||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||||
extract: extractOptions.optional(),
|
extract: extractOptions.optional(),
|
||||||
parsePDF: z.boolean().default(true),
|
parsePDF: z.boolean().default(true),
|
||||||
|
actions: actionsSchema.optional(),
|
||||||
}).strict(strictMessage)
|
}).strict(strictMessage)
|
||||||
|
|
||||||
|
|
||||||
@@ -185,6 +201,9 @@ export type Document = {
|
|||||||
rawHtml?: string;
|
rawHtml?: string;
|
||||||
links?: string[];
|
links?: string[];
|
||||||
screenshot?: string;
|
screenshot?: string;
|
||||||
|
actions?: {
|
||||||
|
screenshots: string[];
|
||||||
|
};
|
||||||
metadata: {
|
metadata: {
|
||||||
title?: string;
|
title?: string;
|
||||||
description?: string;
|
description?: string;
|
||||||
@@ -336,6 +355,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
|||||||
screenshot: x.formats.includes("screenshot"),
|
screenshot: x.formats.includes("screenshot"),
|
||||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||||
parsePDF: x.parsePDF,
|
parsePDF: x.parsePDF,
|
||||||
|
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -370,6 +390,7 @@ export function legacyDocumentConverter(doc: any): Document {
|
|||||||
html: doc.html,
|
html: doc.html,
|
||||||
extract: doc.llm_extraction,
|
extract: doc.llm_extraction,
|
||||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||||
|
actions: doc.actions ?? undefined,
|
||||||
metadata: {
|
metadata: {
|
||||||
...doc.metadata,
|
...doc.metadata,
|
||||||
pageError: undefined,
|
pageError: undefined,
|
||||||
|
|||||||
@@ -110,6 +110,9 @@ export class Document {
|
|||||||
childrenLinks?: string[];
|
childrenLinks?: string[];
|
||||||
provider?: string;
|
provider?: string;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
|
actions?: {
|
||||||
|
screenshots: string[];
|
||||||
|
}
|
||||||
|
|
||||||
index?: number;
|
index?: number;
|
||||||
linksOnPage?: string[]; // Add this new field as a separate property
|
linksOnPage?: string[]; // Add this new field as a separate property
|
||||||
@@ -149,7 +152,7 @@ export class SearchResult {
|
|||||||
|
|
||||||
export interface FireEngineResponse {
|
// Result shape returned by scrapWithFireEngine: the page HTML plus optional
// screenshots, status code and error text.
export interface FireEngineResponse {
|
||||||
html: string;
|
html: string;
|
||||||
screenshot: string;
|
// Optional list replacing the former single `screenshot` string; the error and
// timeout return paths omit it, so readers fall back to `?? []`.
screenshots?: string[];
|
||||||
pageStatusCode?: number;
|
pageStatusCode?: number;
|
||||||
pageError?: string;
|
pageError?: string;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
|
||||||
logParams.error_message = "Request timed out";
|
logParams.error_message = "Request timed out";
|
||||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
|
return { html: "", pageStatusCode: null, pageError: "" };
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
|
||||||
@@ -155,7 +155,6 @@ export async function scrapWithFireEngine({
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
html: "",
|
html: "",
|
||||||
screenshot: "",
|
|
||||||
pageStatusCode,
|
pageStatusCode,
|
||||||
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
|
||||||
};
|
};
|
||||||
@@ -171,7 +170,7 @@ export async function scrapWithFireEngine({
|
|||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
logParams.response_code = pageStatusCode;
|
logParams.response_code = pageStatusCode;
|
||||||
logParams.error_message = pageError;
|
logParams.error_message = pageError;
|
||||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
return { html: content, pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
const data = checkStatusResponse.data;
|
const data = checkStatusResponse.data;
|
||||||
|
|
||||||
@@ -183,7 +182,7 @@ export async function scrapWithFireEngine({
|
|||||||
logParams.error_message = data.pageError ?? data.error;
|
logParams.error_message = data.pageError ?? data.error;
|
||||||
return {
|
return {
|
||||||
html: data.content ?? "",
|
html: data.content ?? "",
|
||||||
screenshot: data.screenshot ?? "",
|
screenshots: data.screenshots,
|
||||||
pageStatusCode: data.pageStatusCode,
|
pageStatusCode: data.pageStatusCode,
|
||||||
pageError: data.pageError ?? data.error,
|
pageError: data.pageError ?? data.error,
|
||||||
};
|
};
|
||||||
@@ -196,7 +195,7 @@ export async function scrapWithFireEngine({
|
|||||||
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`);
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
}
|
}
|
||||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
return { html: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
const endTime = Date.now();
|
const endTime = Date.now();
|
||||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||||
|
|||||||
@@ -69,8 +69,13 @@ function getScrapingFallbackOrder(
|
|||||||
defaultScraper?: string,
|
defaultScraper?: string,
|
||||||
isWaitPresent: boolean = false,
|
isWaitPresent: boolean = false,
|
||||||
isScreenshotPresent: boolean = false,
|
isScreenshotPresent: boolean = false,
|
||||||
isHeadersPresent: boolean = false
|
isHeadersPresent: boolean = false,
|
||||||
|
isActionsPresent: boolean = false,
|
||||||
) {
|
) {
|
||||||
|
if (isActionsPresent) {
|
||||||
|
return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
|
||||||
|
}
|
||||||
|
|
||||||
const availableScrapers = baseScrapers.filter((scraper) => {
|
const availableScrapers = baseScrapers.filter((scraper) => {
|
||||||
switch (scraper) {
|
switch (scraper) {
|
||||||
case "scrapingBee":
|
case "scrapingBee":
|
||||||
@@ -170,6 +175,9 @@ export async function scrapSingleUrl(
|
|||||||
let scraperResponse: {
|
let scraperResponse: {
|
||||||
text: string;
|
text: string;
|
||||||
screenshot: string;
|
screenshot: string;
|
||||||
|
actions?: {
|
||||||
|
screenshots: string[];
|
||||||
|
};
|
||||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||||
} = { text: "", screenshot: "", metadata: {} };
|
} = { text: "", screenshot: "", metadata: {} };
|
||||||
let screenshot = "";
|
let screenshot = "";
|
||||||
@@ -217,7 +225,14 @@ export async function scrapSingleUrl(
|
|||||||
teamId,
|
teamId,
|
||||||
});
|
});
|
||||||
scraperResponse.text = response.html;
|
scraperResponse.text = response.html;
|
||||||
scraperResponse.screenshot = response.screenshot;
|
if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
|
||||||
|
scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
|
||||||
|
}
|
||||||
|
if (pageOptions.actions) {
|
||||||
|
scraperResponse.actions = {
|
||||||
|
screenshots: response.screenshots ?? [],
|
||||||
|
};
|
||||||
|
}
|
||||||
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
|
||||||
scraperResponse.metadata.pageError = response.pageError;
|
scraperResponse.metadata.pageError = response.pageError;
|
||||||
}
|
}
|
||||||
@@ -283,9 +298,6 @@ export async function scrapSingleUrl(
|
|||||||
]) : ([]),
|
]) : ([]),
|
||||||
pageOptions: customScraperResult.pageOptions,
|
pageOptions: customScraperResult.pageOptions,
|
||||||
});
|
});
|
||||||
if (screenshot) {
|
|
||||||
customScrapedContent.screenshot = screenshot;
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case "pdf":
|
case "pdf":
|
||||||
const { content, pageStatusCode, pageError } =
|
const { content, pageStatusCode, pageError } =
|
||||||
@@ -295,7 +307,6 @@ export async function scrapSingleUrl(
|
|||||||
);
|
);
|
||||||
customScrapedContent = {
|
customScrapedContent = {
|
||||||
html: content,
|
html: content,
|
||||||
screenshot,
|
|
||||||
pageStatusCode,
|
pageStatusCode,
|
||||||
pageError,
|
pageError,
|
||||||
};
|
};
|
||||||
@@ -305,7 +316,6 @@ export async function scrapSingleUrl(
|
|||||||
|
|
||||||
if (customScrapedContent) {
|
if (customScrapedContent) {
|
||||||
scraperResponse.text = customScrapedContent.html;
|
scraperResponse.text = customScrapedContent.html;
|
||||||
screenshot = customScrapedContent.screenshot;
|
|
||||||
}
|
}
|
||||||
//* TODO: add an optional to return markdown or structured/extracted content
|
//* TODO: add an optional to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||||
@@ -325,16 +335,18 @@ export async function scrapSingleUrl(
|
|||||||
html: cleanedHtml,
|
html: cleanedHtml,
|
||||||
rawHtml: scraperResponse.text,
|
rawHtml: scraperResponse.text,
|
||||||
screenshot: scraperResponse.screenshot,
|
screenshot: scraperResponse.screenshot,
|
||||||
|
actions: scraperResponse.actions,
|
||||||
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
pageStatusCode: scraperResponse.metadata.pageStatusCode,
|
||||||
pageError: scraperResponse.metadata.pageError || undefined,
|
pageError: scraperResponse.metadata.pageError || undefined,
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
|
let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
|
||||||
text: "",
|
text: "",
|
||||||
html: "",
|
html: "",
|
||||||
rawHtml: "",
|
rawHtml: "",
|
||||||
screenshot: "",
|
screenshot: "",
|
||||||
|
actions: undefined,
|
||||||
pageStatusCode: 200,
|
pageStatusCode: 200,
|
||||||
pageError: undefined,
|
pageError: undefined,
|
||||||
};
|
};
|
||||||
@@ -350,7 +362,8 @@ export async function scrapSingleUrl(
|
|||||||
defaultScraper,
|
defaultScraper,
|
||||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||||
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
pageOptions && pageOptions.headers && pageOptions.headers !== undefined,
|
||||||
|
pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
|
||||||
);
|
);
|
||||||
|
|
||||||
for (const scraper of scrapersInOrder) {
|
for (const scraper of scrapersInOrder) {
|
||||||
@@ -367,6 +380,7 @@ export async function scrapSingleUrl(
|
|||||||
html = attempt.html ?? "";
|
html = attempt.html ?? "";
|
||||||
rawHtml = attempt.rawHtml ?? "";
|
rawHtml = attempt.rawHtml ?? "";
|
||||||
screenshot = attempt.screenshot ?? "";
|
screenshot = attempt.screenshot ?? "";
|
||||||
|
actions = attempt.actions ?? undefined;
|
||||||
|
|
||||||
if (attempt.pageStatusCode) {
|
if (attempt.pageStatusCode) {
|
||||||
pageStatusCode = attempt.pageStatusCode;
|
pageStatusCode = attempt.pageStatusCode;
|
||||||
@@ -404,45 +418,27 @@ export async function scrapSingleUrl(
|
|||||||
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
linksOnPage = extractLinks(rawHtml, urlToScrap);
|
||||||
}
|
}
|
||||||
|
|
||||||
let document: Document;
|
let document: Document = {
|
||||||
if (screenshot && screenshot.length > 0) {
|
content: text,
|
||||||
document = {
|
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
||||||
content: text,
|
html: pageOptions.includeHtml ? html : undefined,
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
rawHtml:
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
pageOptions.includeRawHtml ||
|
||||||
rawHtml:
|
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
||||||
pageOptions.includeRawHtml ||
|
? rawHtml
|
||||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
: undefined,
|
||||||
? rawHtml
|
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
||||||
: undefined,
|
actions,
|
||||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
metadata: {
|
||||||
metadata: {
|
...metadata,
|
||||||
...metadata,
|
...(screenshot && screenshot.length > 0 ? ({
|
||||||
screenshot: screenshot,
|
screenshot,
|
||||||
sourceURL: urlToScrap,
|
}) : {}),
|
||||||
pageStatusCode: pageStatusCode,
|
sourceURL: urlToScrap,
|
||||||
pageError: pageError,
|
pageStatusCode: pageStatusCode,
|
||||||
},
|
pageError: pageError,
|
||||||
};
|
},
|
||||||
} else {
|
};
|
||||||
document = {
|
|
||||||
content: text,
|
|
||||||
markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
|
|
||||||
html: pageOptions.includeHtml ? html : undefined,
|
|
||||||
rawHtml:
|
|
||||||
pageOptions.includeRawHtml ||
|
|
||||||
(extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
|
|
||||||
? rawHtml
|
|
||||||
: undefined,
|
|
||||||
metadata: {
|
|
||||||
...metadata,
|
|
||||||
sourceURL: urlToScrap,
|
|
||||||
pageStatusCode: pageStatusCode,
|
|
||||||
pageError: pageError,
|
|
||||||
},
|
|
||||||
linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return document;
|
return document;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
Reference in New Issue
Block a user