From a0b8a6cad3613cc2cecc1acacdf1a5dcfd50363f Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 13:43:33 +0200 Subject: [PATCH 01/23] feat(js-sdk): build both cjs and esm versions --- apps/js-sdk/firecrawl/build/cjs/index.js | 271 ++++++++++++++++++ apps/js-sdk/firecrawl/build/cjs/package.json | 1 + .../js-sdk/firecrawl/build/{ => esm}/index.js | 0 apps/js-sdk/firecrawl/build/esm/package.json | 1 + apps/js-sdk/firecrawl/package.json | 8 +- apps/js-sdk/firecrawl/types/index.d.ts | 22 +- apps/js-sdk/package-lock.json | 34 ++- apps/js-sdk/package.json | 1 + 8 files changed, 324 insertions(+), 14 deletions(-) create mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js create mode 100644 apps/js-sdk/firecrawl/build/cjs/package.json rename apps/js-sdk/firecrawl/build/{ => esm}/index.js (100%) create mode 100644 apps/js-sdk/firecrawl/build/esm/package.json diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js new file mode 100644 index 00000000..dbc2d6b9 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/index.js @@ -0,0 +1,271 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __importDefault = (this && this.__importDefault) || function (mod) { + return (mod && mod.__esModule) ? mod : { "default": mod }; +}; +Object.defineProperty(exports, "__esModule", { value: true }); +const axios_1 = __importDefault(require("axios")); +const zod_1 = require("zod"); +const zod_to_json_schema_1 = require("zod-to-json-schema"); +/** + * Main class for interacting with the Firecrawl API. + */ +class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null) { + var _a; + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof zod_1.z.ZodSchema) { + schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query_1) { + return __awaiter(this, arguments, void 0, function* (query, params = null) { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios_1.default.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios_1.default.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} +exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json new file mode 100644 index 00000000..b731bd61 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/package.json @@ -0,0 +1 @@ +{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/esm/index.js similarity index 100% rename from apps/js-sdk/firecrawl/build/index.js rename to apps/js-sdk/firecrawl/build/esm/index.js diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json new file mode 100644 index 00000000..6990891f --- /dev/null +++ b/apps/js-sdk/firecrawl/build/esm/package.json @@ -0,0 +1 @@ +{"type": "module"} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0fef67b0..178e5c66 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,11 +2,15 @@ "name": "@mendable/firecrawl-js", "version": "0.0.29", "description": "JavaScript SDK for Firecrawl API", - "main": "build/index.js", + "main": "build/cjs/index.js", "types": "types/index.d.ts", "type": "module", + "exports": { + "require": "./build/cjs/index.js", + "import": "./build/esm/index.js" + }, "scripts": { - "build": "tsc", + "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "jest src/__tests__/**/*.test.ts" diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 91a58043..bd6cfc20 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -73,16 +73,16 @@ export interface ScrapeResponse { error?: string; } /** -* Response interface for searching operations. -*/ + * Response interface for searching operations. + */ export interface SearchResponse { success: boolean; data?: FirecrawlDocument[]; error?: string; } /** -* Response interface for crawling operations. -*/ + * Response interface for crawling operations. + */ export interface CrawlResponse { success: boolean; jobId?: string; @@ -90,24 +90,28 @@ export interface CrawlResponse { error?: string; } /** -* Response interface for job status checks. -*/ + * Response interface for job status checks. + */ export interface JobStatusResponse { success: boolean; status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; jobId?: string; data?: FirecrawlDocument[]; partial_data?: FirecrawlDocument[]; error?: string; } /** - * Generic parameter interface. - */ + * Generic parameter interface. + */ export interface Params { [key: string]: any; extractorOptions?: { extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction" | "llm-extraction-from-raw-html"; + mode?: "llm-extraction"; extractionPrompt?: string; }; } diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 2bf3f001..ca337062 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -13,6 +13,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { @@ -450,6 +451,15 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, + "node_modules/@types/node": { + "version": "20.14.11", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz", + "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==", + "peer": true, + "dependencies": { + "undici-types": "~5.26.4" + } + }, "node_modules/acorn": { "version": "8.11.3", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -728,6 +738,24 @@ "node": ">=14.17" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "peer": true + }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", @@ -750,9 +778,9 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.23.0", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz", - "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==", + "version": "3.23.1", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz", + "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==", "peerDependencies": { "zod": "^3.23.3" } diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 0e93fe3c..2d2c36e8 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -15,6 +15,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { From 2e62de4f8b58e74113555ade9a6b89689a73f4a6 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 13:45:51 +0200 Subject: [PATCH 02/23] fix(js-sdk): remove built files from repo and add to gitignore --- apps/js-sdk/firecrawl/.gitignore | 2 + apps/js-sdk/firecrawl/build/cjs/index.js | 271 ------------------- apps/js-sdk/firecrawl/build/cjs/package.json | 1 - apps/js-sdk/firecrawl/build/esm/index.js | 265 ------------------ apps/js-sdk/firecrawl/build/esm/package.json | 1 - 5 files changed, 2 insertions(+), 538 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js delete mode 100644 apps/js-sdk/firecrawl/build/cjs/package.json delete mode 100644 apps/js-sdk/firecrawl/build/esm/index.js delete mode 100644 apps/js-sdk/firecrawl/build/esm/package.json diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index c6bba591..96e545b3 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -128,3 +128,5 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* + +build diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js deleted file mode 100644 index dbc2d6b9..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/index.js +++ /dev/null @@ -1,271 +0,0 @@ -"use strict"; -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const axios_1 = __importDefault(require("axios")); -const zod_1 = require("zod"); -const zod_to_json_schema_1 = require("zod-to-json-schema"); -/** - * Main class for interacting with the Firecrawl API. - */ -class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof zod_1.z.ZodSchema) { - schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios_1.default.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios_1.default.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} -exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json deleted file mode 100644 index b731bd61..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js deleted file mode 100644 index 99de5e2b..00000000 --- a/apps/js-sdk/firecrawl/build/esm/index.js +++ /dev/null @@ -1,265 +0,0 @@ -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -import axios from "axios"; -import { z } from "zod"; -import { zodToJsonSchema } from "zod-to-json-schema"; -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json deleted file mode 100644 index 6990891f..00000000 --- a/apps/js-sdk/firecrawl/build/esm/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "module"} From 361269974ea6b73a33d03ce388f28f6ae030a16d Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 13:48:39 +0200 Subject: [PATCH 03/23] fix(js-sdk): remove autogenerated index.d.ts from git and add to gitignore --- apps/js-sdk/firecrawl/.gitignore | 1 + apps/js-sdk/firecrawl/types/index.d.ts | 193 ------------------------- 2 files changed, 1 insertion(+), 193 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index 96e545b3..1acd6303 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -130,3 +130,4 @@ dist .pnp.* build +types diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts deleted file mode 100644 index bd6cfc20..00000000 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ /dev/null @@ -1,193 +0,0 @@ -import { AxiosResponse, AxiosRequestHeaders } from "axios"; -import { z } from "zod"; -/** - * Configuration interface for FirecrawlApp. - */ -export interface FirecrawlAppConfig { - apiKey?: string | null; - apiUrl?: string | null; -} -/** - * Metadata for a Firecrawl document. - */ -export interface FirecrawlDocumentMetadata { - title?: string; - description?: string; - language?: string; - keywords?: string; - robots?: string; - ogTitle?: string; - ogDescription?: string; - ogUrl?: string; - ogImage?: string; - ogAudio?: string; - ogDeterminer?: string; - ogLocale?: string; - ogLocaleAlternate?: string[]; - ogSiteName?: string; - ogVideo?: string; - dctermsCreated?: string; - dcDateCreated?: string; - dcDate?: string; - dctermsType?: string; - dcType?: string; - dctermsAudience?: string; - dctermsSubject?: string; - dcSubject?: string; - dcDescription?: string; - dctermsKeywords?: string; - modifiedTime?: string; - publishedTime?: string; - articleTag?: string; - articleSection?: string; - sourceURL?: string; - pageStatusCode?: number; - pageError?: string; - [key: string]: any; -} -/** - * Document interface for Firecrawl. - */ -export interface FirecrawlDocument { - id?: string; - url?: string; - content: string; - markdown?: string; - html?: string; - llm_extraction?: Record; - createdAt?: Date; - updatedAt?: Date; - type?: string; - metadata: FirecrawlDocumentMetadata; - childrenLinks?: string[]; - provider?: string; - warning?: string; - index?: number; -} -/** - * Response interface for scraping operations. - */ -export interface ScrapeResponse { - success: boolean; - data?: FirecrawlDocument; - error?: string; -} -/** - * Response interface for searching operations. - */ -export interface SearchResponse { - success: boolean; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Response interface for crawling operations. - */ -export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Response interface for job status checks. - */ -export interface JobStatusResponse { - success: boolean; - status: string; - current?: number; - current_url?: string; - current_step?: string; - total?: number; - jobId?: string; - data?: FirecrawlDocument[]; - partial_data?: FirecrawlDocument[]; - error?: string; -} -/** - * Generic parameter interface. - */ -export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; -} -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - private apiKey; - private apiUrl; - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey, apiUrl }: FirecrawlAppConfig); - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url: string, params?: Params | null): Promise; - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query: string, params?: Params | null): Promise; - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise; - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId: string): Promise; - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url: string, headers: AxiosRequestHeaders): Promise; - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response: AxiosResponse, action: string): void; -} From db926a4146192c03a3e023a5a9e9b4d570bc21a7 Mon Sep 17 00:00:00 2001 From: tak-s Date: Sun, 4 Aug 2024 15:05:53 +0900 Subject: [PATCH 04/23] set LOGGING_LEVEL to environment --- docker-compose.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index 4974e8b8..ffcbc4ee 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -29,6 +29,7 @@ x-common-service: &common-service - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY} - HOST=${HOST:-0.0.0.0} - SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL} + - LOGGING_LEVEL=${LOGGING_LEVEL} extra_hosts: - "host.docker.internal:host-gateway" From af9bc5c8bbe30d0c271697be95a58c19895f132e Mon Sep 17 00:00:00 2001 From: tak-s Date: Sun, 4 Aug 2024 15:09:36 +0900 Subject: [PATCH 05/23] Suppressed repetitive logs --- apps/api/src/services/supabase.ts | 8 -------- 1 file changed, 8 deletions(-) diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index d34f7b52..70ada12b 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -36,17 +36,9 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { - if (process.env.USE_DB_AUTHENTICATION === "false") { - Logger.debug( - "Attempted to access Supabase client when it's not configured." - ); - } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { - Logger.error( - "Attempted to access Supabase client when it's not configured." - ); return () => { throw new Error("Supabase client is not configured."); }; From f32e8de156236dda6f1dd34a1c020a7174821b77 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 5 Aug 2024 18:13:31 -0300 Subject: [PATCH 06/23] fixes the empty excludes.filter undefined bug --- apps/api/src/lib/entities.ts | 4 ++-- apps/api/src/scraper/WebScraper/index.ts | 27 ++++++++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 9ffa4810..8083b905 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -42,8 +42,8 @@ export type SearchOptions = { export type CrawlerOptions = { returnOnlyUrls?: boolean; - includes?: string[]; - excludes?: string[]; + includes?: string | string[]; + excludes?: string | string[]; maxCrawledLinks?: number; maxDepth?: number; limit?: number; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 9171b805..e667fa6b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -27,8 +27,8 @@ export class WebScraperDataProvider { private bullJobId: string; private urls: string[] = [""]; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; - private includes: string[]; - private excludes: string[]; + private includes: string | string[]; + private excludes: string | string[]; private maxCrawledLinks: number; private maxCrawledDepth: number = 10; private returnOnlyUrls: boolean; @@ -171,8 +171,8 @@ export class WebScraperDataProvider { const crawler = new WebCrawler({ jobId: this.jobId, initialUrl: this.urls[0], - includes: this.includes, - excludes: this.excludes, + includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','), + excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','), maxCrawledLinks: this.maxCrawledLinks, maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth), limit: this.limit, @@ -445,6 +445,10 @@ export class WebScraperDataProvider { const url = new URL(document.metadata.sourceURL); const path = url.pathname; + if (!Array.isArray(this.excludes)) { + this.excludes = this.excludes.split(','); + } + if (this.excludes.length > 0 && this.excludes[0] !== "") { // Check if the link should be excluded if ( @@ -456,6 +460,10 @@ export class WebScraperDataProvider { } } + if (!Array.isArray(this.includes)) { + this.includes = this.includes.split(','); + } + if (this.includes.length > 0 && this.includes[0] !== "") { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0) { @@ -567,8 +575,15 @@ export class WebScraperDataProvider { options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; - //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check - this.excludes = this.excludes.filter((item) => item !== ""); + + if (typeof options.crawlerOptions?.excludes === 'string') { + this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== ""); + } + + if (typeof options.crawlerOptions?.includes === 'string') { + this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== ""); + } + this.crawlerMode = options.crawlerOptions?.mode ?? "default"; this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; this.allowBackwardCrawling = From 3edc3a3d1580b7ca10a51dbd852a37def6103c0c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 5 Aug 2024 18:17:37 -0300 Subject: [PATCH 07/23] added fullpagescreenshot capabilities, wip on fire-engine side --- apps/api/openapi.json | 10 ++++++++++ apps/api/src/lib/default-values.ts | 1 + apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 4 ++++ apps/api/src/scraper/WebScraper/single_url.ts | 2 ++ 5 files changed, 18 insertions(+) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index e0b583f0..fb0c4305 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -84,6 +84,11 @@ "description": "Include a screenshot of the top of the page that you are scraping.", "default": false }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", @@ -317,6 +322,11 @@ "description": "Include a screenshot of the top of the page that you are scraping.", "default": false }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index 3b303781..152f47d7 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -7,6 +7,7 @@ export const defaultPageOptions = { includeHtml: false, waitFor: 0, screenshot: false, + fullPageScreenshot: false, parsePDF: true }; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 9ffa4810..4dc2050d 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { fetchPageContent?: boolean; waitFor?: number; screenshot?: boolean; + fullPageScreenshot?: boolean; headers?: Record; replaceAllPathsWithAbsolutePaths?: boolean; parsePDF?: boolean; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index ba67043c..dfe23a89 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger"; * @param url The URL to scrape * @param waitFor The time to wait for the page to load * @param screenshot Whether to take a screenshot + * @param fullPageScreenshot Whether to take a full page screenshot * @param pageOptions The options for the page * @param headers The headers to send with the request * @param options The options for the request @@ -20,6 +21,7 @@ export async function scrapWithFireEngine({ url, waitFor = 0, screenshot = false, + fullPageScreenshot = false, pageOptions = { parsePDF: true }, fireEngineOptions = {}, headers, @@ -28,6 +30,7 @@ export async function scrapWithFireEngine({ url: string; waitFor?: number; screenshot?: boolean; + fullPageScreenshot?: boolean; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; @@ -71,6 +74,7 @@ export async function scrapWithFireEngine({ url: url, wait: waitParam, screenshot: screenshotParam, + fullPageScreenshot: fullPageScreenshot, headers: headers, pageOptions: pageOptions, ...fireEngineOptionsParam, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4a44b23f..0fa2fc8b 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -128,6 +128,7 @@ export async function scrapSingleUrl( includeRawHtml: false, waitFor: 0, screenshot: false, + fullPageScreenshot: false, headers: undefined, }, extractorOptions: ExtractorOptions = { @@ -171,6 +172,7 @@ export async function scrapSingleUrl( url, waitFor: pageOptions.waitFor, screenshot: pageOptions.screenshot, + fullPageScreenshot: pageOptions.fullPageScreenshot, pageOptions: pageOptions, headers: pageOptions.headers, fireEngineOptions: { From 4d24a99d50358a31a5aa30fde79309545227c3dc Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 6 Aug 2024 09:34:43 -0300 Subject: [PATCH 08/23] fix params --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 5 +++-- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index dfe23a89..0bb9986f 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -52,6 +52,7 @@ export async function scrapWithFireEngine({ const waitParam = reqParams["params"]?.wait ?? waitFor; const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; + const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -64,7 +65,7 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? Logger.info( - `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }` + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }` ); @@ -74,7 +75,7 @@ export async function scrapWithFireEngine({ url: url, wait: waitParam, screenshot: screenshotParam, - fullPageScreenshot: fullPageScreenshot, + fullPageScreenshot: fullPageScreenshotParam, headers: headers, pageOptions: pageOptions, ...fireEngineOptionsParam, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0fa2fc8b..12e075fd 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -308,7 +308,7 @@ export async function scrapSingleUrl( const scrapersInOrder = getScrapingFallbackOrder( defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, - pageOptions && pageOptions.screenshot && pageOptions.screenshot === true, + pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), pageOptions && pageOptions.headers && pageOptions.headers !== undefined ); From a67a5c04c91409e80e81a19536736e70f9c478d8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 6 Aug 2024 18:02:56 -0400 Subject: [PATCH 09/23] Revert "Merge pull request #432 from mendableai/mog/js-sdk-cjs" This reverts commit bb90e03dea5dc2f475dbf8dc70425af9b3dfe246, reversing changes made to 3321ca9398b5a129ffc1689ec2a88b2468cf16e9. --- apps/js-sdk/firecrawl/types/index.d.ts | 189 +++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 00000000..91a58043 --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,189 @@ +import { AxiosResponse, AxiosRequestHeaders } from "axios"; +import { z } from "zod"; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; + apiUrl?: string | null; +} +/** + * Metadata for a Firecrawl document. + */ +export interface FirecrawlDocumentMetadata { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dctermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dctermsType?: string; + dcType?: string; + dctermsAudience?: string; + dctermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dctermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; + [key: string]: any; +} +/** + * Document interface for Firecrawl. + */ +export interface FirecrawlDocument { + id?: string; + url?: string; + content: string; + markdown?: string; + html?: string; + llm_extraction?: Record; + createdAt?: Date; + updatedAt?: Date; + type?: string; + metadata: FirecrawlDocumentMetadata; + childrenLinks?: string[]; + provider?: string; + warning?: string; + index?: number; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: FirecrawlDocument; + error?: string; +} +/** +* Response interface for searching operations. +*/ +export interface SearchResponse { + success: boolean; + data?: FirecrawlDocument[]; + error?: string; +} +/** +* Response interface for crawling operations. +*/ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: FirecrawlDocument[]; + error?: string; +} +/** +* Response interface for job status checks. +*/ +export interface JobStatusResponse { + success: boolean; + status: string; + jobId?: string; + data?: FirecrawlDocument[]; + partial_data?: FirecrawlDocument[]; + error?: string; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; + extractorOptions?: { + extractionSchema: z.ZodSchema | any; + mode?: "llm-extraction" | "llm-extraction-from-raw-html"; + extractionPrompt?: string; + }; +} +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + private apiKey; + private apiUrl; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey, apiUrl }: FirecrawlAppConfig); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url: string, params?: Params | null): Promise; + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query: string, params?: Params | null): Promise; + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise; + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId: string): Promise; + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url: string, headers: AxiosRequestHeaders): Promise; + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response: AxiosResponse, action: string): void; +} From 5da4472842497dbe4d462609293917e0b2199fc8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 6 Aug 2024 18:41:06 -0400 Subject: [PATCH 10/23] Revert "Merge pull request #432 from mendableai/mog/js-sdk-cjs" This reverts commit bb90e03dea5dc2f475dbf8dc70425af9b3dfe246, reversing changes made to 3321ca9398b5a129ffc1689ec2a88b2468cf16e9. --- apps/js-sdk/firecrawl/.gitignore | 3 - apps/js-sdk/firecrawl/build/index.js | 265 +++++++++++++++++++++++++++ apps/js-sdk/firecrawl/package.json | 8 +- apps/js-sdk/package-lock.json | 34 +--- apps/js-sdk/package.json | 1 - 5 files changed, 270 insertions(+), 41 deletions(-) create mode 100644 apps/js-sdk/firecrawl/build/index.js diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index 1acd6303..c6bba591 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -128,6 +128,3 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* - -build -types diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js new file mode 100644 index 00000000..99de5e2b --- /dev/null +++ b/apps/js-sdk/firecrawl/build/index.js @@ -0,0 +1,265 @@ +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +import axios from "axios"; +import { z } from "zod"; +import { zodToJsonSchema } from "zod-to-json-schema"; +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null) { + var _a; + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query_1) { + return __awaiter(this, arguments, void 0, function* (query, params = null) { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 5a6d6bfb..71d2362e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,15 +2,11 @@ "name": "@mendable/firecrawl-js", "version": "0.0.29", "description": "JavaScript SDK for Firecrawl API", - "main": "build/cjs/index.js", + "main": "build/index.js", "types": "types/index.d.ts", "type": "module", - "exports": { - "require": "./build/cjs/index.js", - "import": "./build/esm/index.js" - }, "scripts": { - "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", + "build": "tsc", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index ca337062..2bf3f001 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -13,7 +13,6 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", - "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { @@ -451,15 +450,6 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, - "node_modules/@types/node": { - "version": "20.14.11", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz", - "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==", - "peer": true, - "dependencies": { - "undici-types": "~5.26.4" - } - }, "node_modules/acorn": { "version": "8.11.3", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -738,24 +728,6 @@ "node": ">=14.17" } }, - "node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "peer": true - }, - "node_modules/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", @@ -778,9 +750,9 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.23.1", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz", - "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==", + "version": "3.23.0", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz", + "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==", "peerDependencies": { "zod": "^3.23.3" } diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 2d2c36e8..0e93fe3c 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -15,7 +15,6 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", - "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { From f294d3922cde4f66a717d3e6b7e1664660b31b19 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 6 Aug 2024 18:44:45 -0400 Subject: [PATCH 11/23] Nick: revert --- apps/js-sdk/firecrawl/build/cjs/index.js | 271 +++++++++++++++++++ apps/js-sdk/firecrawl/build/cjs/package.json | 1 + apps/js-sdk/firecrawl/build/esm/index.js | 265 ++++++++++++++++++ apps/js-sdk/firecrawl/build/esm/package.json | 1 + apps/js-sdk/firecrawl/build/index.js | 14 +- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/types/index.d.ts | 22 +- 7 files changed, 559 insertions(+), 17 deletions(-) create mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js create mode 100644 apps/js-sdk/firecrawl/build/cjs/package.json create mode 100644 apps/js-sdk/firecrawl/build/esm/index.js create mode 100644 apps/js-sdk/firecrawl/build/esm/package.json diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js new file mode 100644 index 00000000..da340cae --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/index.js @@ -0,0 +1,271 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __importDefault = (this && this.__importDefault) || function (mod) { + return (mod && mod.__esModule) ? mod : { "default": mod }; +}; +Object.defineProperty(exports, "__esModule", { value: true }); +const axios_1 = __importDefault(require("axios")); +const zod_1 = require("zod"); +const zod_to_json_schema_1 = require("zod-to-json-schema"); +/** + * Main class for interacting with the Firecrawl API. + */ +class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url, params = null) { + var _a; + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof zod_1.z.ZodSchema) { + schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query, params = null) { + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios_1.default.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios_1.default.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} +exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json new file mode 100644 index 00000000..b731bd61 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/package.json @@ -0,0 +1 @@ +{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js new file mode 100644 index 00000000..ef79f180 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/esm/index.js @@ -0,0 +1,265 @@ +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +import axios from "axios"; +import { z } from "zod"; +import { zodToJsonSchema } from "zod-to-json-schema"; +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url, params = null) { + var _a; + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query, params = null) { + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json new file mode 100644 index 00000000..6990891f --- /dev/null +++ b/apps/js-sdk/firecrawl/build/esm/package.json @@ -0,0 +1 @@ +{"type": "module"} diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index 99de5e2b..ef79f180 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -31,9 +31,9 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the scrape request. * @returns {Promise} The response from the scrape operation. */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; + scrapeUrl(url, params = null) { + var _a; + return __awaiter(this, void 0, void 0, function* () { const headers = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -74,8 +74,8 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the search request. * @returns {Promise} The response from the search operation. */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { + search(query, params = null) { + return __awaiter(this, void 0, void 0, function* () { const headers = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -114,8 +114,8 @@ export default class FirecrawlApp { * @param {string} idempotencyKey - Optional idempotency key for the request. * @returns {Promise} The response from the crawl operation. */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + return __awaiter(this, void 0, void 0, function* () { const headers = this.prepareHeaders(idempotencyKey); let jsonData = { url }; if (params) { diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 71d2362e..e6a398e4 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.29", + "version": "0.0.34", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 91a58043..bd6cfc20 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -73,16 +73,16 @@ export interface ScrapeResponse { error?: string; } /** -* Response interface for searching operations. -*/ + * Response interface for searching operations. + */ export interface SearchResponse { success: boolean; data?: FirecrawlDocument[]; error?: string; } /** -* Response interface for crawling operations. -*/ + * Response interface for crawling operations. + */ export interface CrawlResponse { success: boolean; jobId?: string; @@ -90,24 +90,28 @@ export interface CrawlResponse { error?: string; } /** -* Response interface for job status checks. -*/ + * Response interface for job status checks. + */ export interface JobStatusResponse { success: boolean; status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; jobId?: string; data?: FirecrawlDocument[]; partial_data?: FirecrawlDocument[]; error?: string; } /** - * Generic parameter interface. - */ + * Generic parameter interface. + */ export interface Params { [key: string]: any; extractorOptions?: { extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction" | "llm-extraction-from-raw-html"; + mode?: "llm-extraction"; extractionPrompt?: string; }; } From 5f7724205f59bd7f415da16805216d5a7ad4657d Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 01:06:21 +0200 Subject: [PATCH 12/23] fix(js-sdk): re-add types --- apps/js-sdk/firecrawl/.gitignore | 1 - apps/js-sdk/firecrawl/types/index.d.ts | 193 +++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index 1acd6303..96e545b3 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -130,4 +130,3 @@ dist .pnp.* build -types diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 00000000..bd6cfc20 --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,193 @@ +import { AxiosResponse, AxiosRequestHeaders } from "axios"; +import { z } from "zod"; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; + apiUrl?: string | null; +} +/** + * Metadata for a Firecrawl document. + */ +export interface FirecrawlDocumentMetadata { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dctermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dctermsType?: string; + dcType?: string; + dctermsAudience?: string; + dctermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dctermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; + [key: string]: any; +} +/** + * Document interface for Firecrawl. + */ +export interface FirecrawlDocument { + id?: string; + url?: string; + content: string; + markdown?: string; + html?: string; + llm_extraction?: Record; + createdAt?: Date; + updatedAt?: Date; + type?: string; + metadata: FirecrawlDocumentMetadata; + childrenLinks?: string[]; + provider?: string; + warning?: string; + index?: number; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: FirecrawlDocument; + error?: string; +} +/** + * Response interface for searching operations. + */ +export interface SearchResponse { + success: boolean; + data?: FirecrawlDocument[]; + error?: string; +} +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: FirecrawlDocument[]; + error?: string; +} +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; + jobId?: string; + data?: FirecrawlDocument[]; + partial_data?: FirecrawlDocument[]; + error?: string; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; + extractorOptions?: { + extractionSchema: z.ZodSchema | any; + mode?: "llm-extraction"; + extractionPrompt?: string; + }; +} +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + private apiKey; + private apiUrl; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey, apiUrl }: FirecrawlAppConfig); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url: string, params?: Params | null): Promise; + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query: string, params?: Params | null): Promise; + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise; + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId: string): Promise; + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url: string, headers: AxiosRequestHeaders): Promise; + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response: AxiosResponse, action: string): void; +} From 020a5efdb761ee96253c1100f748537cd1b1dd00 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 01:27:26 +0200 Subject: [PATCH 13/23] Revert "Revert "Merge pull request #432 from mendableai/mog/js-sdk-cjs"" This reverts commit 5da4472842497dbe4d462609293917e0b2199fc8. --- apps/js-sdk/firecrawl/build/index.js | 265 --------------------------- apps/js-sdk/firecrawl/package.json | 8 +- apps/js-sdk/package-lock.json | 34 +++- apps/js-sdk/package.json | 1 + 4 files changed, 38 insertions(+), 270 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/build/index.js diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js deleted file mode 100644 index ef79f180..00000000 --- a/apps/js-sdk/firecrawl/build/index.js +++ /dev/null @@ -1,265 +0,0 @@ -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -import axios from "axios"; -import { z } from "zod"; -import { zodToJsonSchema } from "zod-to-json-schema"; -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url, params = null) { - var _a; - return __awaiter(this, void 0, void 0, function* () { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query, params = null) { - return __awaiter(this, void 0, void 0, function* () { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e6a398e4..f50e7a4e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,11 +2,15 @@ "name": "@mendable/firecrawl-js", "version": "0.0.34", "description": "JavaScript SDK for Firecrawl API", - "main": "build/index.js", + "main": "build/cjs/index.js", "types": "types/index.d.ts", "type": "module", + "exports": { + "require": "./build/cjs/index.js", + "import": "./build/esm/index.js" + }, "scripts": { - "build": "tsc", + "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 2bf3f001..ca337062 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -13,6 +13,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { @@ -450,6 +451,15 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, + "node_modules/@types/node": { + "version": "20.14.11", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz", + "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==", + "peer": true, + "dependencies": { + "undici-types": "~5.26.4" + } + }, "node_modules/acorn": { "version": "8.11.3", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -728,6 +738,24 @@ "node": ">=14.17" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "peer": true + }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", @@ -750,9 +778,9 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.23.0", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz", - "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==", + "version": "3.23.1", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz", + "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==", "peerDependencies": { "zod": "^3.23.3" } diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 0e93fe3c..2d2c36e8 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -15,6 +15,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { From b5ec47fd96b2d36a398186f96a09be601271f79d Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 13:53:04 +0200 Subject: [PATCH 14/23] fix(runWebScraper): don't fetch next job --- apps/api/src/main/runWebScraper.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 5e7d2279..76665aa2 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -131,13 +131,13 @@ const saveJob = async (job: Job, result: any) => { if (error) throw new Error(error.message); try { - await job.moveToCompleted(null); + await job.moveToCompleted(null, false, false); } catch (error) { // I think the job won't exist here anymore } } else { try { - await job.moveToCompleted(result); + await job.moveToCompleted(result, false, false); } catch (error) { // I think the job won't exist here anymore } From 2e2e80d679d067ccc65d18a7443fda3b3d512cdc Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:17:50 +0200 Subject: [PATCH 15/23] fix(scrape-events): updateScrapeResult fix --- apps/api/src/lib/scrape-events.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 8d677279..cd8cfa9a 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -59,6 +59,12 @@ export class ScrapeEvents { try { const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + + if (!previousLog) { + Logger.warn("Previous log not found."); + return; + } + await supabase.from("scrape_events").update({ content: { ...previousLog.content, From 8216266d16564fa137ae3fb632114afec697162b Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:19:20 +0200 Subject: [PATCH 16/23] fix(scrape_log): display error properly --- apps/api/src/services/logging/scrape_log.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 208159da..099e4a0b 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -44,9 +44,9 @@ export async function logScrape( ]); if (error) { - Logger.error(`Error logging proxy:\n${error}`); + Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`); } } catch (error) { - Logger.error(`Error logging proxy:\n${error}`); + Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`); } } From 7bb922071cce3008529128b99f8a0004021f75f3 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:35:20 +0200 Subject: [PATCH 17/23] fix(queue-worker): manually renew lock (testing) --- apps/api/src/scraper/WebScraper/index.ts | 2 +- apps/api/src/services/queue-worker.ts | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e667fa6b..c3834bcd 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -94,7 +94,7 @@ export class WebScraperDataProvider { const jobStatus = await job.getState(); if (jobStatus === "failed") { Logger.info( - "Job has failed or has been cancelled by the user. Stopping the job..." + "Job " + job.id + " has failed or has been cancelled by the user. Stopping the job..." ); return [] as Document[]; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e7767809..e46ffc1a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -22,6 +22,11 @@ const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { Logger.debug(`🐂 Worker taking job ${job.id}`); + const lockInterval = setInterval(() => { + Logger.debug(`🐂 Renewing lock for ${job.id}`); + job.extendLock(60000); + }, 15000); + try { job.progress({ current: 1, @@ -62,6 +67,7 @@ async function processJob(job: Job, done) { origin: job.data.origin, }); Logger.debug(`🐂 Job done ${job.id}`); + clearInterval(lockInterval); done(null, data); } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); @@ -108,8 +114,9 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); + clearInterval(lockInterval); done(null, data); - } + } } wsq.process( From 9df8719efa52f4465e5345a772b473d2a3d96264 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:56:04 +0200 Subject: [PATCH 18/23] fix(queue-worker): raise queue log level to info --- apps/api/src/services/queue-worker.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e46ffc1a..a82f12cd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -20,10 +20,10 @@ if (process.env.ENV === 'production') { const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { - Logger.debug(`🐂 Worker taking job ${job.id}`); + Logger.info(`🐂 Worker taking job ${job.id}`); const lockInterval = setInterval(() => { - Logger.debug(`🐂 Renewing lock for ${job.id}`); + Logger.info(`🐂 Renewing lock for ${job.id}`); job.extendLock(60000); }, 15000); @@ -66,7 +66,7 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - Logger.debug(`🐂 Job done ${job.id}`); + Logger.info(`🐂 Job done ${job.id}`); clearInterval(lockInterval); done(null, data); } catch (error) { From cdf7bad5b4bc001c739096387f7e078fa043174b Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 15:20:56 +0200 Subject: [PATCH 19/23] fix(runWebScraper): don't move to completed --- apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/services/queue-worker.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 76665aa2..ba22f28b 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -131,7 +131,7 @@ const saveJob = async (job: Job, result: any) => { if (error) throw new Error(error.message); try { - await job.moveToCompleted(null, false, false); + // await job.moveToCompleted(null, false, false); } catch (error) { // I think the job won't exist here anymore } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a82f12cd..bddf9300 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -68,7 +68,7 @@ async function processJob(job: Job, done) { }); Logger.info(`🐂 Job done ${job.id}`); clearInterval(lockInterval); - done(null, data); + done(null, null); } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { From b7c01dcb9b325036a1ca768cf8bfdcfe5e93df27 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 16:31:50 +0200 Subject: [PATCH 20/23] fix(webScraperQueue): reduce retries to 2 --- apps/api/src/services/queue-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index d531c2db..0cd65f32 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -14,7 +14,7 @@ export function getWebScraperQueue() { maxStalledCount: 10, }, defaultJobOptions:{ - attempts: 5 + attempts: 2 } }); Logger.info("Web scraper queue created"); From fe9fdb578b1ce2adbd5966b9df677c44e7c9b07e Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 16:34:25 +0200 Subject: [PATCH 21/23] revert bad hotfixes --- apps/api/src/lib/scrape-events.ts | 6 ------ apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 2 +- apps/api/src/services/queue-worker.ts | 11 ++--------- 4 files changed, 4 insertions(+), 17 deletions(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index cd8cfa9a..8d677279 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -59,12 +59,6 @@ export class ScrapeEvents { try { const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; - - if (!previousLog) { - Logger.warn("Previous log not found."); - return; - } - await supabase.from("scrape_events").update({ content: { ...previousLog.content, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index ba22f28b..76665aa2 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -131,7 +131,7 @@ const saveJob = async (job: Job, result: any) => { if (error) throw new Error(error.message); try { - // await job.moveToCompleted(null, false, false); + await job.moveToCompleted(null, false, false); } catch (error) { // I think the job won't exist here anymore } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c3834bcd..e667fa6b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -94,7 +94,7 @@ export class WebScraperDataProvider { const jobStatus = await job.getState(); if (jobStatus === "failed") { Logger.info( - "Job " + job.id + " has failed or has been cancelled by the user. Stopping the job..." + "Job has failed or has been cancelled by the user. Stopping the job..." ); return [] as Document[]; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index bddf9300..cc92b3ab 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -22,11 +22,6 @@ const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { Logger.info(`🐂 Worker taking job ${job.id}`); - const lockInterval = setInterval(() => { - Logger.info(`🐂 Renewing lock for ${job.id}`); - job.extendLock(60000); - }, 15000); - try { job.progress({ current: 1, @@ -67,8 +62,7 @@ async function processJob(job: Job, done) { origin: job.data.origin, }); Logger.info(`🐂 Job done ${job.id}`); - clearInterval(lockInterval); - done(null, null); + done(null, data); } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { @@ -114,9 +108,8 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - clearInterval(lockInterval); done(null, data); - } + } } wsq.process( From b12e1157cc3628c109346753f54518b29b95168d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 7 Aug 2024 10:40:00 -0400 Subject: [PATCH 22/23] Nick: v35 bump --- apps/js-sdk/firecrawl/build/esm/index.js | 14 +++++++------- apps/js-sdk/firecrawl/package-lock.json | 4 ++-- apps/js-sdk/firecrawl/package.json | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js index ef79f180..99de5e2b 100644 --- a/apps/js-sdk/firecrawl/build/esm/index.js +++ b/apps/js-sdk/firecrawl/build/esm/index.js @@ -31,9 +31,9 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the scrape request. * @returns {Promise} The response from the scrape operation. */ - scrapeUrl(url, params = null) { - var _a; - return __awaiter(this, void 0, void 0, function* () { + scrapeUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null) { + var _a; const headers = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -74,8 +74,8 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the search request. * @returns {Promise} The response from the search operation. */ - search(query, params = null) { - return __awaiter(this, void 0, void 0, function* () { + search(query_1) { + return __awaiter(this, arguments, void 0, function* (query, params = null) { const headers = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -114,8 +114,8 @@ export default class FirecrawlApp { * @param {string} idempotencyKey - Optional idempotency key for the request. * @returns {Promise} The response from the crawl operation. */ - crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - return __awaiter(this, void 0, void 0, function* () { + crawlUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { const headers = this.prepareHeaders(idempotencyKey); let jsonData = { url }; if (params) { diff --git a/apps/js-sdk/firecrawl/package-lock.json b/apps/js-sdk/firecrawl/package-lock.json index 25b0e305..c42d6ca7 100644 --- a/apps/js-sdk/firecrawl/package-lock.json +++ b/apps/js-sdk/firecrawl/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.29", + "version": "0.0.34", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@mendable/firecrawl-js", - "version": "0.0.29", + "version": "0.0.34", "license": "MIT", "dependencies": { "axios": "^1.6.8", diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f50e7a4e..a76748d4 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.34", + "version": "0.0.35", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", From f1f56050100ebcfa4d81357f96f0544d3bc616dd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 8 Aug 2024 12:31:58 -0400 Subject: [PATCH 23/23] Update website_params.ts --- .../src/scraper/WebScraper/utils/custom/website_params.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index 8583e614..fcd3f69b 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -240,4 +240,12 @@ export const urlSpecificParams = { }, }, }, + "digikey.com":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + engine: "tlsclient", + }, + }, + } };