Merge branch 'mendableai:main' into feat/add-go-sdk
This commit is contained in:
@@ -84,6 +84,11 @@
|
|||||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||||
"default": false
|
"default": false
|
||||||
},
|
},
|
||||||
|
"fullPageScreenshot": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
"waitFor": {
|
"waitFor": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||||
@@ -317,6 +322,11 @@
|
|||||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||||
"default": false
|
"default": false
|
||||||
},
|
},
|
||||||
|
"fullPageScreenshot": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
"waitFor": {
|
"waitFor": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ export const defaultPageOptions = {
|
|||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
|
fullPageScreenshot: false,
|
||||||
parsePDF: true
|
parsePDF: true
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ export type PageOptions = {
|
|||||||
fetchPageContent?: boolean;
|
fetchPageContent?: boolean;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
|
fullPageScreenshot?: boolean;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
parsePDF?: boolean;
|
parsePDF?: boolean;
|
||||||
@@ -42,8 +43,8 @@ export type SearchOptions = {
|
|||||||
|
|
||||||
export type CrawlerOptions = {
|
export type CrawlerOptions = {
|
||||||
returnOnlyUrls?: boolean;
|
returnOnlyUrls?: boolean;
|
||||||
includes?: string[];
|
includes?: string | string[];
|
||||||
excludes?: string[];
|
excludes?: string | string[];
|
||||||
maxCrawledLinks?: number;
|
maxCrawledLinks?: number;
|
||||||
maxDepth?: number;
|
maxDepth?: number;
|
||||||
limit?: number;
|
limit?: number;
|
||||||
|
|||||||
@@ -131,13 +131,13 @@ const saveJob = async (job: Job, result: any) => {
|
|||||||
|
|
||||||
if (error) throw new Error(error.message);
|
if (error) throw new Error(error.message);
|
||||||
try {
|
try {
|
||||||
await job.moveToCompleted(null);
|
await job.moveToCompleted(null, false, false);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// I think the job won't exist here anymore
|
// I think the job won't exist here anymore
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
await job.moveToCompleted(result);
|
await job.moveToCompleted(result, false, false);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// I think the job won't exist here anymore
|
// I think the job won't exist here anymore
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,8 +27,8 @@ export class WebScraperDataProvider {
|
|||||||
private bullJobId: string;
|
private bullJobId: string;
|
||||||
private urls: string[] = [""];
|
private urls: string[] = [""];
|
||||||
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
|
||||||
private includes: string[];
|
private includes: string | string[];
|
||||||
private excludes: string[];
|
private excludes: string | string[];
|
||||||
private maxCrawledLinks: number;
|
private maxCrawledLinks: number;
|
||||||
private maxCrawledDepth: number = 10;
|
private maxCrawledDepth: number = 10;
|
||||||
private returnOnlyUrls: boolean;
|
private returnOnlyUrls: boolean;
|
||||||
@@ -171,8 +171,8 @@ export class WebScraperDataProvider {
|
|||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: this.jobId,
|
jobId: this.jobId,
|
||||||
initialUrl: this.urls[0],
|
initialUrl: this.urls[0],
|
||||||
includes: this.includes,
|
includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','),
|
||||||
excludes: this.excludes,
|
excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','),
|
||||||
maxCrawledLinks: this.maxCrawledLinks,
|
maxCrawledLinks: this.maxCrawledLinks,
|
||||||
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
|
||||||
limit: this.limit,
|
limit: this.limit,
|
||||||
@@ -445,6 +445,10 @@ export class WebScraperDataProvider {
|
|||||||
const url = new URL(document.metadata.sourceURL);
|
const url = new URL(document.metadata.sourceURL);
|
||||||
const path = url.pathname;
|
const path = url.pathname;
|
||||||
|
|
||||||
|
if (!Array.isArray(this.excludes)) {
|
||||||
|
this.excludes = this.excludes.split(',');
|
||||||
|
}
|
||||||
|
|
||||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||||
// Check if the link should be excluded
|
// Check if the link should be excluded
|
||||||
if (
|
if (
|
||||||
@@ -456,6 +460,10 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!Array.isArray(this.includes)) {
|
||||||
|
this.includes = this.includes.split(',');
|
||||||
|
}
|
||||||
|
|
||||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||||
// Check if the link matches the include patterns, if any are specified
|
// Check if the link matches the include patterns, if any are specified
|
||||||
if (this.includes.length > 0) {
|
if (this.includes.length > 0) {
|
||||||
@@ -567,8 +575,15 @@ export class WebScraperDataProvider {
|
|||||||
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||||
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
|
||||||
false;
|
false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
if (typeof options.crawlerOptions?.excludes === 'string') {
|
||||||
|
this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof options.crawlerOptions?.includes === 'string') {
|
||||||
|
this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
|
||||||
|
}
|
||||||
|
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
||||||
this.allowBackwardCrawling =
|
this.allowBackwardCrawling =
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
|
|||||||
* @param url The URL to scrape
|
* @param url The URL to scrape
|
||||||
* @param waitFor The time to wait for the page to load
|
* @param waitFor The time to wait for the page to load
|
||||||
* @param screenshot Whether to take a screenshot
|
* @param screenshot Whether to take a screenshot
|
||||||
|
* @param fullPageScreenshot Whether to take a full page screenshot
|
||||||
* @param pageOptions The options for the page
|
* @param pageOptions The options for the page
|
||||||
* @param headers The headers to send with the request
|
* @param headers The headers to send with the request
|
||||||
* @param options The options for the request
|
* @param options The options for the request
|
||||||
@@ -20,6 +21,7 @@ export async function scrapWithFireEngine({
|
|||||||
url,
|
url,
|
||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
|
fullPageScreenshot = false,
|
||||||
pageOptions = { parsePDF: true },
|
pageOptions = { parsePDF: true },
|
||||||
fireEngineOptions = {},
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
@@ -28,6 +30,7 @@ export async function scrapWithFireEngine({
|
|||||||
url: string;
|
url: string;
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
|
fullPageScreenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
|
||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
@@ -49,6 +52,7 @@ export async function scrapWithFireEngine({
|
|||||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||||
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
|
||||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||||
|
const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
|
||||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||||
|
|
||||||
|
|
||||||
@@ -61,7 +65,7 @@ export async function scrapWithFireEngine({
|
|||||||
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
let engine = engineParam; // do we want fireEngineOptions as first choice?
|
||||||
|
|
||||||
Logger.info(
|
Logger.info(
|
||||||
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
`⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
@@ -71,6 +75,7 @@ export async function scrapWithFireEngine({
|
|||||||
url: url,
|
url: url,
|
||||||
wait: waitParam,
|
wait: waitParam,
|
||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
|
fullPageScreenshot: fullPageScreenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
...fireEngineOptionsParam,
|
...fireEngineOptionsParam,
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ export async function scrapSingleUrl(
|
|||||||
includeRawHtml: false,
|
includeRawHtml: false,
|
||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
|
fullPageScreenshot: false,
|
||||||
headers: undefined,
|
headers: undefined,
|
||||||
},
|
},
|
||||||
extractorOptions: ExtractorOptions = {
|
extractorOptions: ExtractorOptions = {
|
||||||
@@ -171,6 +172,7 @@ export async function scrapSingleUrl(
|
|||||||
url,
|
url,
|
||||||
waitFor: pageOptions.waitFor,
|
waitFor: pageOptions.waitFor,
|
||||||
screenshot: pageOptions.screenshot,
|
screenshot: pageOptions.screenshot,
|
||||||
|
fullPageScreenshot: pageOptions.fullPageScreenshot,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
fireEngineOptions: {
|
fireEngineOptions: {
|
||||||
@@ -306,7 +308,7 @@ export async function scrapSingleUrl(
|
|||||||
const scrapersInOrder = getScrapingFallbackOrder(
|
const scrapersInOrder = getScrapingFallbackOrder(
|
||||||
defaultScraper,
|
defaultScraper,
|
||||||
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
|
||||||
pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
|
pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
|
||||||
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
pageOptions && pageOptions.headers && pageOptions.headers !== undefined
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -240,4 +240,12 @@ export const urlSpecificParams = {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
"digikey.com":{
|
||||||
|
defaultScraper: "fire-engine",
|
||||||
|
params:{
|
||||||
|
fireEngineOptions:{
|
||||||
|
engine: "tlsclient",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -44,9 +44,9 @@ export async function logScrape(
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
Logger.error(`Error logging proxy:\n${error}`);
|
Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error logging proxy:\n${error}`);
|
Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ export function getWebScraperQueue() {
|
|||||||
maxStalledCount: 10,
|
maxStalledCount: 10,
|
||||||
},
|
},
|
||||||
defaultJobOptions:{
|
defaultJobOptions:{
|
||||||
attempts: 5
|
attempts: 2
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
Logger.info("Web scraper queue created");
|
Logger.info("Web scraper queue created");
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ if (process.env.ENV === 'production') {
|
|||||||
const wsq = getWebScraperQueue();
|
const wsq = getWebScraperQueue();
|
||||||
|
|
||||||
async function processJob(job: Job, done) {
|
async function processJob(job: Job, done) {
|
||||||
Logger.debug(`🐂 Worker taking job ${job.id}`);
|
Logger.info(`🐂 Worker taking job ${job.id}`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
job.progress({
|
job.progress({
|
||||||
@@ -61,7 +61,7 @@ async function processJob(job: Job, done) {
|
|||||||
pageOptions: job.data.pageOptions,
|
pageOptions: job.data.pageOptions,
|
||||||
origin: job.data.origin,
|
origin: job.data.origin,
|
||||||
});
|
});
|
||||||
Logger.debug(`🐂 Job done ${job.id}`);
|
Logger.info(`🐂 Job done ${job.id}`);
|
||||||
done(null, data);
|
done(null, data);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
|
||||||
|
|||||||
@@ -36,17 +36,9 @@ export const supabase_service: SupabaseClient = new Proxy(
|
|||||||
new SupabaseService(),
|
new SupabaseService(),
|
||||||
{
|
{
|
||||||
get: function (target, prop, receiver) {
|
get: function (target, prop, receiver) {
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "false") {
|
|
||||||
Logger.debug(
|
|
||||||
"Attempted to access Supabase client when it's not configured."
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const client = target.getClient();
|
const client = target.getClient();
|
||||||
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
|
// If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
|
||||||
if (client === null) {
|
if (client === null) {
|
||||||
Logger.error(
|
|
||||||
"Attempted to access Supabase client when it's not configured."
|
|
||||||
);
|
|
||||||
return () => {
|
return () => {
|
||||||
throw new Error("Supabase client is not configured.");
|
throw new Error("Supabase client is not configured.");
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -128,3 +128,5 @@ dist
|
|||||||
.yarn/build-state.yml
|
.yarn/build-state.yml
|
||||||
.yarn/install-state.gz
|
.yarn/install-state.gz
|
||||||
.pnp.*
|
.pnp.*
|
||||||
|
|
||||||
|
build
|
||||||
|
|||||||
@@ -0,0 +1,271 @@
|
|||||||
|
"use strict";
|
||||||
|
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
||||||
|
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
||||||
|
return new (P || (P = Promise))(function (resolve, reject) {
|
||||||
|
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
||||||
|
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
||||||
|
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
||||||
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
||||||
|
});
|
||||||
|
};
|
||||||
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
||||||
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
||||||
|
};
|
||||||
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||||||
|
const axios_1 = __importDefault(require("axios"));
|
||||||
|
const zod_1 = require("zod");
|
||||||
|
const zod_to_json_schema_1 = require("zod-to-json-schema");
|
||||||
|
/**
|
||||||
|
* Main class for interacting with the Firecrawl API.
|
||||||
|
*/
|
||||||
|
class FirecrawlApp {
|
||||||
|
/**
|
||||||
|
* Initializes a new instance of the FirecrawlApp class.
|
||||||
|
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
||||||
|
*/
|
||||||
|
constructor({ apiKey = null, apiUrl = null }) {
|
||||||
|
this.apiKey = apiKey || "";
|
||||||
|
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
||||||
|
if (!this.apiKey) {
|
||||||
|
throw new Error("No API key provided");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Scrapes a URL using the Firecrawl API.
|
||||||
|
* @param {string} url - The URL to scrape.
|
||||||
|
* @param {Params | null} params - Additional parameters for the scrape request.
|
||||||
|
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
|
||||||
|
*/
|
||||||
|
scrapeUrl(url, params = null) {
|
||||||
|
var _a;
|
||||||
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
|
const headers = {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
Authorization: `Bearer ${this.apiKey}`,
|
||||||
|
};
|
||||||
|
let jsonData = Object.assign({ url }, params);
|
||||||
|
if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
|
||||||
|
let schema = params.extractorOptions.extractionSchema;
|
||||||
|
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
||||||
|
if (schema instanceof zod_1.z.ZodSchema) {
|
||||||
|
schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
|
||||||
|
}
|
||||||
|
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers });
|
||||||
|
if (response.status === 200) {
|
||||||
|
const responseData = response.data;
|
||||||
|
if (responseData.success) {
|
||||||
|
return responseData;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.handleError(response, "scrape URL");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
throw new Error(error.message);
|
||||||
|
}
|
||||||
|
return { success: false, error: "Internal server error." };
|
||||||
|
});
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Searches for a query using the Firecrawl API.
|
||||||
|
* @param {string} query - The query to search for.
|
||||||
|
* @param {Params | null} params - Additional parameters for the search request.
|
||||||
|
* @returns {Promise<SearchResponse>} The response from the search operation.
|
||||||
|
*/
|
||||||
|
search(query, params = null) {
|
||||||
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
|
const headers = {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
Authorization: `Bearer ${this.apiKey}`,
|
||||||
|
};
|
||||||
|
let jsonData = { query };
|
||||||
|
if (params) {
|
||||||
|
jsonData = Object.assign(Object.assign({}, jsonData), params);
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers });
|
||||||
|
if (response.status === 200) {
|
||||||
|
const responseData = response.data;
|
||||||
|
if (responseData.success) {
|
||||||
|
return responseData;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new Error(`Failed to search. Error: ${responseData.error}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.handleError(response, "search");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
throw new Error(error.message);
|
||||||
|
}
|
||||||
|
return { success: false, error: "Internal server error." };
|
||||||
|
});
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Initiates a crawl job for a URL using the Firecrawl API.
|
||||||
|
* @param {string} url - The URL to crawl.
|
||||||
|
* @param {Params | null} params - Additional parameters for the crawl request.
|
||||||
|
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
||||||
|
* @param {number} pollInterval - Time in seconds for job status checks.
|
||||||
|
* @param {string} idempotencyKey - Optional idempotency key for the request.
|
||||||
|
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
|
||||||
|
*/
|
||||||
|
crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) {
|
||||||
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
|
const headers = this.prepareHeaders(idempotencyKey);
|
||||||
|
let jsonData = { url };
|
||||||
|
if (params) {
|
||||||
|
jsonData = Object.assign(Object.assign({}, jsonData), params);
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers);
|
||||||
|
if (response.status === 200) {
|
||||||
|
const jobId = response.data.jobId;
|
||||||
|
if (waitUntilDone) {
|
||||||
|
return this.monitorJobStatus(jobId, headers, pollInterval);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return { success: true, jobId };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.handleError(response, "start crawl job");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
console.log(error);
|
||||||
|
throw new Error(error.message);
|
||||||
|
}
|
||||||
|
return { success: false, error: "Internal server error." };
|
||||||
|
});
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Checks the status of a crawl job using the Firecrawl API.
|
||||||
|
* @param {string} jobId - The job ID of the crawl operation.
|
||||||
|
* @returns {Promise<JobStatusResponse>} The response containing the job status.
|
||||||
|
*/
|
||||||
|
checkCrawlStatus(jobId) {
|
||||||
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
|
const headers = this.prepareHeaders();
|
||||||
|
try {
|
||||||
|
const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
|
||||||
|
if (response.status === 200) {
|
||||||
|
return {
|
||||||
|
success: true,
|
||||||
|
status: response.data.status,
|
||||||
|
current: response.data.current,
|
||||||
|
current_url: response.data.current_url,
|
||||||
|
current_step: response.data.current_step,
|
||||||
|
total: response.data.total,
|
||||||
|
data: response.data.data,
|
||||||
|
partial_data: !response.data.data
|
||||||
|
? response.data.partial_data
|
||||||
|
: undefined,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.handleError(response, "check crawl status");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
throw new Error(error.message);
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
success: false,
|
||||||
|
status: "unknown",
|
||||||
|
current: 0,
|
||||||
|
current_url: "",
|
||||||
|
current_step: "",
|
||||||
|
total: 0,
|
||||||
|
error: "Internal server error.",
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Prepares the headers for an API request.
|
||||||
|
* @returns {AxiosRequestHeaders} The prepared headers.
|
||||||
|
*/
|
||||||
|
prepareHeaders(idempotencyKey) {
|
||||||
|
return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}));
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Sends a POST request to the specified URL.
|
||||||
|
* @param {string} url - The URL to send the request to.
|
||||||
|
* @param {Params} data - The data to send in the request.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @returns {Promise<AxiosResponse>} The response from the POST request.
|
||||||
|
*/
|
||||||
|
postRequest(url, data, headers) {
|
||||||
|
return axios_1.default.post(url, data, { headers });
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Sends a GET request to the specified URL.
|
||||||
|
* @param {string} url - The URL to send the request to.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @returns {Promise<AxiosResponse>} The response from the GET request.
|
||||||
|
*/
|
||||||
|
getRequest(url, headers) {
|
||||||
|
return axios_1.default.get(url, { headers });
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Monitors the status of a crawl job until completion or failure.
|
||||||
|
* @param {string} jobId - The job ID of the crawl operation.
|
||||||
|
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||||
|
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||||
|
* @returns {Promise<any>} The final job status or data.
|
||||||
|
*/
|
||||||
|
monitorJobStatus(jobId, headers, checkInterval) {
|
||||||
|
return __awaiter(this, void 0, void 0, function* () {
|
||||||
|
while (true) {
|
||||||
|
const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
|
||||||
|
if (statusResponse.status === 200) {
|
||||||
|
const statusData = statusResponse.data;
|
||||||
|
if (statusData.status === "completed") {
|
||||||
|
if ("data" in statusData) {
|
||||||
|
return statusData.data;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new Error("Crawl job completed but no data was returned");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
|
||||||
|
if (checkInterval < 2) {
|
||||||
|
checkInterval = 2;
|
||||||
|
}
|
||||||
|
yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.handleError(statusResponse, "check crawl status");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Handles errors from API responses.
|
||||||
|
* @param {AxiosResponse} response - The response from the API.
|
||||||
|
* @param {string} action - The action being performed when the error occurred.
|
||||||
|
*/
|
||||||
|
handleError(response, action) {
|
||||||
|
if ([402, 408, 409, 500].includes(response.status)) {
|
||||||
|
const errorMessage = response.data.error || "Unknown error occurred";
|
||||||
|
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
exports.default = FirecrawlApp;
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"type": "commonjs"}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"type": "module"}
|
||||||
Generated
+2
-2
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "@mendable/firecrawl-js",
|
"name": "@mendable/firecrawl-js",
|
||||||
"version": "0.0.29",
|
"version": "0.0.34",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "@mendable/firecrawl-js",
|
"name": "@mendable/firecrawl-js",
|
||||||
"version": "0.0.29",
|
"version": "0.0.34",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"axios": "^1.6.8",
|
"axios": "^1.6.8",
|
||||||
|
|||||||
@@ -1,12 +1,16 @@
|
|||||||
{
|
{
|
||||||
"name": "@mendable/firecrawl-js",
|
"name": "@mendable/firecrawl-js",
|
||||||
"version": "0.0.29",
|
"version": "0.0.35",
|
||||||
"description": "JavaScript SDK for Firecrawl API",
|
"description": "JavaScript SDK for Firecrawl API",
|
||||||
"main": "build/index.js",
|
"main": "build/cjs/index.js",
|
||||||
"types": "types/index.d.ts",
|
"types": "types/index.d.ts",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
|
"exports": {
|
||||||
|
"require": "./build/cjs/index.js",
|
||||||
|
"import": "./build/esm/index.js"
|
||||||
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "tsc",
|
"build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json",
|
||||||
"build-and-publish": "npm run build && npm publish --access public",
|
"build-and-publish": "npm run build && npm publish --access public",
|
||||||
"publish-beta": "npm run build && npm publish --access public --tag beta",
|
"publish-beta": "npm run build && npm publish --access public --tag beta",
|
||||||
"test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts"
|
"test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts"
|
||||||
|
|||||||
+5
-1
@@ -95,6 +95,10 @@ export interface CrawlResponse {
|
|||||||
export interface JobStatusResponse {
|
export interface JobStatusResponse {
|
||||||
success: boolean;
|
success: boolean;
|
||||||
status: string;
|
status: string;
|
||||||
|
current?: number;
|
||||||
|
current_url?: string;
|
||||||
|
current_step?: string;
|
||||||
|
total?: number;
|
||||||
jobId?: string;
|
jobId?: string;
|
||||||
data?: FirecrawlDocument[];
|
data?: FirecrawlDocument[];
|
||||||
partial_data?: FirecrawlDocument[];
|
partial_data?: FirecrawlDocument[];
|
||||||
@@ -107,7 +111,7 @@ export interface Params {
|
|||||||
[key: string]: any;
|
[key: string]: any;
|
||||||
extractorOptions?: {
|
extractorOptions?: {
|
||||||
extractionSchema: z.ZodSchema | any;
|
extractionSchema: z.ZodSchema | any;
|
||||||
mode?: "llm-extraction" | "llm-extraction-from-raw-html";
|
mode?: "llm-extraction";
|
||||||
extractionPrompt?: string;
|
extractionPrompt?: string;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
Generated
+31
-3
@@ -13,6 +13,7 @@
|
|||||||
"axios": "^1.6.8",
|
"axios": "^1.6.8",
|
||||||
"ts-node": "^10.9.2",
|
"ts-node": "^10.9.2",
|
||||||
"typescript": "^5.4.5",
|
"typescript": "^5.4.5",
|
||||||
|
"uuid": "^10.0.0",
|
||||||
"zod": "^3.23.8"
|
"zod": "^3.23.8"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
@@ -450,6 +451,15 @@
|
|||||||
"resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz",
|
"resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz",
|
||||||
"integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA=="
|
"integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA=="
|
||||||
},
|
},
|
||||||
|
"node_modules/@types/node": {
|
||||||
|
"version": "20.14.11",
|
||||||
|
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz",
|
||||||
|
"integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==",
|
||||||
|
"peer": true,
|
||||||
|
"dependencies": {
|
||||||
|
"undici-types": "~5.26.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/acorn": {
|
"node_modules/acorn": {
|
||||||
"version": "8.11.3",
|
"version": "8.11.3",
|
||||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz",
|
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz",
|
||||||
@@ -728,6 +738,24 @@
|
|||||||
"node": ">=14.17"
|
"node": ">=14.17"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/undici-types": {
|
||||||
|
"version": "5.26.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
|
||||||
|
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
|
||||||
|
"peer": true
|
||||||
|
},
|
||||||
|
"node_modules/uuid": {
|
||||||
|
"version": "10.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz",
|
||||||
|
"integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==",
|
||||||
|
"funding": [
|
||||||
|
"https://github.com/sponsors/broofa",
|
||||||
|
"https://github.com/sponsors/ctavan"
|
||||||
|
],
|
||||||
|
"bin": {
|
||||||
|
"uuid": "dist/bin/uuid"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/v8-compile-cache-lib": {
|
"node_modules/v8-compile-cache-lib": {
|
||||||
"version": "3.0.1",
|
"version": "3.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||||
@@ -750,9 +778,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/zod-to-json-schema": {
|
"node_modules/zod-to-json-schema": {
|
||||||
"version": "3.23.0",
|
"version": "3.23.1",
|
||||||
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz",
|
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz",
|
||||||
"integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==",
|
"integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==",
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"zod": "^3.23.3"
|
"zod": "^3.23.3"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
"axios": "^1.6.8",
|
"axios": "^1.6.8",
|
||||||
"ts-node": "^10.9.2",
|
"ts-node": "^10.9.2",
|
||||||
"typescript": "^5.4.5",
|
"typescript": "^5.4.5",
|
||||||
|
"uuid": "^10.0.0",
|
||||||
"zod": "^3.23.8"
|
"zod": "^3.23.8"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ x-common-service: &common-service
|
|||||||
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
|
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
|
||||||
- HOST=${HOST:-0.0.0.0}
|
- HOST=${HOST:-0.0.0.0}
|
||||||
- SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
|
- SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
|
||||||
|
- LOGGING_LEVEL=${LOGGING_LEVEL}
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
- "host.docker.internal:host-gateway"
|
- "host.docker.internal:host-gateway"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user