Merge branch 'main' into feat/idempotency-key

This commit is contained in:
rafaelsideguide
2024-05-23 12:20:06 -03:00
69 changed files with 9115 additions and 615 deletions
+36
View File
@@ -77,6 +77,42 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes
scrapeExample();
```
### Extracting structured data from a URL
With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it easier for you too. Here is how you to use it:
```js
import { z } from "zod";
const zodSchema = z.object({
top: z
.array(
z.object({
title: z.string(),
points: z.number(),
by: z.string(),
commentsURL: z.string(),
})
)
.length(5)
.describe("Top 5 stories on Hacker News"),
});
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
extractorOptions: { extractionSchema: zodSchema },
});
console.log(llmExtractionResult.data.llm_extraction);
```
### Search for a query
Used to search the web, get the most relevant results, scrap each page and return the markdown.
```js
query = 'what is mendable?'
searchResult = app.search(query)
```
### Crawling a Website
+40 -28
View File
@@ -7,7 +7,9 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
import axios from 'axios';
import axios from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
/**
* Main class for interacting with the Firecrawl API.
*/
@@ -19,7 +21,7 @@ export default class FirecrawlApp {
constructor({ apiKey = null }) {
this.apiKey = apiKey || '';
if (!this.apiKey) {
throw new Error('No API key provided');
throw new Error("No API key provided");
}
}
/**
@@ -30,16 +32,22 @@ export default class FirecrawlApp {
*/
scrapeUrl(url_1) {
return __awaiter(this, arguments, void 0, function* (url, params = null) {
var _a;
const headers = {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`,
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
let jsonData = { url };
if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params);
let jsonData = Object.assign({ url }, params);
if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
}
try {
const response = yield axios.post('https://api.firecrawl.dev/v0/scrape', jsonData, { headers });
const response = yield axios.post("https://api.firecrawl.dev/v0/scrape", jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
@@ -50,13 +58,13 @@ export default class FirecrawlApp {
}
}
else {
this.handleError(response, 'scrape URL');
this.handleError(response, "scrape URL");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
return { success: false, error: "Internal server error." };
});
}
/**
@@ -68,15 +76,15 @@ export default class FirecrawlApp {
search(query_1) {
return __awaiter(this, arguments, void 0, function* (query, params = null) {
const headers = {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`,
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
let jsonData = { query };
if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params);
}
try {
const response = yield axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
const response = yield axios.post("https://api.firecrawl.dev/v0/search", jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
@@ -87,13 +95,13 @@ export default class FirecrawlApp {
}
}
else {
this.handleError(response, 'search');
this.handleError(response, "search");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
return { success: false, error: "Internal server error." };
});
}
/**
@@ -113,7 +121,7 @@ export default class FirecrawlApp {
jsonData = Object.assign(Object.assign({}, jsonData), params);
}
try {
const response = yield this.postRequest('https://api.firecrawl.dev/v0/crawl', jsonData, headers);
const response = yield this.postRequest("https://api.firecrawl.dev/v0/crawl", jsonData, headers);
if (response.status === 200) {
const jobId = response.data.jobId;
if (waitUntilDone) {
@@ -124,14 +132,14 @@ export default class FirecrawlApp {
}
}
else {
this.handleError(response, 'start crawl job');
this.handleError(response, "start crawl job");
}
}
catch (error) {
console.log(error);
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
return { success: false, error: "Internal server error." };
});
}
/**
@@ -148,13 +156,17 @@ export default class FirecrawlApp {
return response.data;
}
else {
this.handleError(response, 'check crawl status');
this.handleError(response, "check crawl status");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, status: 'unknown', error: 'Internal server error.' };
return {
success: false,
status: "unknown",
error: "Internal server error.",
};
});
}
/**
@@ -196,26 +208,26 @@ export default class FirecrawlApp {
const statusResponse = yield this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === 'completed') {
if ('data' in statusData) {
if (statusData.status === "completed") {
if ("data" in statusData) {
return statusData.data;
}
else {
throw new Error('Crawl job completed but no data was returned');
throw new Error("Crawl job completed but no data was returned");
}
}
else if (['active', 'paused', 'pending', 'queued'].includes(statusData.status)) {
else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
if (timeout < 2) {
timeout = 2;
}
yield new Promise(resolve => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again
yield new Promise((resolve) => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again
}
else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
}
}
else {
this.handleError(statusResponse, 'check crawl status');
this.handleError(statusResponse, "check crawl status");
}
}
});
@@ -226,8 +238,8 @@ export default class FirecrawlApp {
* @param {string} action - The action being performed when the error occurred.
*/
handleError(response, action) {
if ([402, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || 'Unknown error occurred';
if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || "Unknown error occurred";
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
}
else {
+21 -3
View File
@@ -1,15 +1,17 @@
{
"name": "@mendable/firecrawl-js",
"version": "0.0.13",
"version": "0.0.17-beta.8",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@mendable/firecrawl-js",
"version": "0.0.13",
"version": "0.0.17-beta.8",
"license": "MIT",
"dependencies": {
"axios": "^1.6.8"
"axios": "^1.6.8",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
},
"devDependencies": {
"@jest/globals": "^29.7.0",
@@ -3766,6 +3768,22 @@
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/zod": {
"version": "3.23.8",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz",
"integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==",
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
},
"node_modules/zod-to-json-schema": {
"version": "3.23.0",
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz",
"integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==",
"peerDependencies": {
"zod": "^3.23.3"
}
}
}
}
+5 -2
View File
@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "0.0.16",
"version": "0.0.21",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/index.js",
"types": "types/index.d.ts",
@@ -8,6 +8,7 @@
"scripts": {
"build": "tsc",
"publish": "npm run build && npm publish --access public",
"publish-beta": "npm run build && npm publish --access public --tag beta",
"test": "jest src/**/*.test.ts"
},
"repository": {
@@ -17,7 +18,9 @@
"author": "Mendable.ai",
"license": "MIT",
"dependencies": {
"axios": "^1.6.8"
"axios": "^1.6.8",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
},
"bugs": {
"url": "https://github.com/mendableai/firecrawl/issues"
+116 -44
View File
@@ -1,5 +1,6 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios';
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
/**
* Configuration interface for FirecrawlApp.
*/
@@ -12,6 +13,11 @@ export interface FirecrawlAppConfig {
*/
export interface Params {
[key: string]: any;
extractorOptions?: {
extractionSchema: z.ZodSchema | any;
mode?: "llm-extraction";
extractionPrompt?: string;
};
}
/**
@@ -63,9 +69,9 @@ export default class FirecrawlApp {
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null }: FirecrawlAppConfig) {
this.apiKey = apiKey || '';
this.apiKey = apiKey || "";
if (!this.apiKey) {
throw new Error('No API key provided');
throw new Error("No API key provided");
}
}
@@ -75,31 +81,50 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the scrape request.
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
*/
async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
async scrapeUrl(
url: string,
params: Params | null = null
): Promise<ScrapeResponse> {
const headers: AxiosRequestHeaders = {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`,
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders;
let jsonData: Params = { url };
if (params) {
jsonData = { ...jsonData, ...params };
let jsonData: Params = { url, ...params };
if (params?.extractorOptions?.extractionSchema) {
let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof z.ZodSchema) {
schema = zodToJsonSchema(schema);
}
jsonData = {
...jsonData,
extractorOptions: {
...params.extractorOptions,
extractionSchema: schema,
mode: params.extractorOptions.mode || "llm-extraction",
},
};
}
try {
const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/scrape', jsonData, { headers });
const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/scrape",
jsonData,
{ headers },
);
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return responseData;
return responseData;
} else {
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
}
} else {
this.handleError(response, 'scrape URL');
this.handleError(response, "scrape URL");
}
} catch (error: any) {
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
return { success: false, error: "Internal server error." };
}
/**
@@ -108,31 +133,38 @@ export default class FirecrawlApp {
* @param {Params | null} params - Additional parameters for the search request.
* @returns {Promise<SearchResponse>} The response from the search operation.
*/
async search(query: string, params: Params | null = null): Promise<SearchResponse> {
async search(
query: string,
params: Params | null = null
): Promise<SearchResponse> {
const headers: AxiosRequestHeaders = {
'Content-Type': 'application/json',
'Authorization': `Bearer ${this.apiKey}`,
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders;
let jsonData: Params = { query };
if (params) {
jsonData = { ...jsonData, ...params };
}
try {
const response: AxiosResponse = await axios.post('https://api.firecrawl.dev/v0/search', jsonData, { headers });
const response: AxiosResponse = await axios.post(
"https://api.firecrawl.dev/v0/search",
jsonData,
{ headers }
);
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return responseData;
return responseData;
} else {
throw new Error(`Failed to search. Error: ${responseData.error}`);
}
} else {
this.handleError(response, 'search');
this.handleError(response, "search");
}
} catch (error: any) {
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
return { success: false, error: "Internal server error." };
}
/**
@@ -144,14 +176,24 @@ export default class FirecrawlApp {
* @param {string} idempotencyKey - Optional idempotency key for the request.
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
*/
async crawlUrl(url: string, params: Params | null = null, waitUntilDone: boolean = true, timeout: number = 2, idempotencyKey?: string): Promise<CrawlResponse | any> {
async crawlUrl(
url: string,
params: Params | null = null,
waitUntilDone: boolean = true,
timeout: number = 2,
idempotencyKey?: string
): Promise<CrawlResponse | any> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: Params = { url };
if (params) {
jsonData = { ...jsonData, ...params };
}
try {
const response: AxiosResponse = await this.postRequest('https://api.firecrawl.dev/v0/crawl', jsonData, headers);
const response: AxiosResponse = await this.postRequest(
"https://api.firecrawl.dev/v0/crawl",
jsonData,
headers
);
if (response.status === 200) {
const jobId: string = response.data.jobId;
if (waitUntilDone) {
@@ -160,13 +202,13 @@ export default class FirecrawlApp {
return { success: true, jobId };
}
} else {
this.handleError(response, 'start crawl job');
this.handleError(response, "start crawl job");
}
} catch (error: any) {
console.log(error)
console.log(error);
throw new Error(error.message);
}
return { success: false, error: 'Internal server error.' };
return { success: false, error: "Internal server error." };
}
/**
@@ -177,16 +219,23 @@ export default class FirecrawlApp {
async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
const response: AxiosResponse = await this.getRequest(
`https://api.firecrawl.dev/v0/crawl/status/${jobId}`,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, 'check crawl status');
this.handleError(response, "check crawl status");
}
} catch (error: any) {
throw new Error(error.message);
}
return { success: false, status: 'unknown', error: 'Internal server error.' };
return {
success: false,
status: "unknown",
error: "Internal server error.",
};
}
/**
@@ -208,7 +257,11 @@ export default class FirecrawlApp {
* @param {AxiosRequestHeaders} headers - The headers for the request.
* @returns {Promise<AxiosResponse>} The response from the POST request.
*/
postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
postRequest(
url: string,
data: Params,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.post(url, data, { headers });
}
@@ -218,7 +271,10 @@ export default class FirecrawlApp {
* @param {AxiosRequestHeaders} headers - The headers for the request.
* @returns {Promise<AxiosResponse>} The response from the GET request.
*/
getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
getRequest(
url: string,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.get(url, { headers });
}
@@ -229,27 +285,38 @@ export default class FirecrawlApp {
* @param {number} timeout - Timeout in seconds for job status checks.
* @returns {Promise<any>} The final job status or data.
*/
async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any> {
async monitorJobStatus(
jobId: string,
headers: AxiosRequestHeaders,
timeout: number
): Promise<any> {
while (true) {
const statusResponse: AxiosResponse = await this.getRequest(`https://api.firecrawl.dev/v0/crawl/status/${jobId}`, headers);
const statusResponse: AxiosResponse = await this.getRequest(
`https://api.firecrawl.dev/v0/crawl/status/${jobId}`,
headers
);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === 'completed') {
if ('data' in statusData) {
if (statusData.status === "completed") {
if ("data" in statusData) {
return statusData.data;
} else {
throw new Error('Crawl job completed but no data was returned');
throw new Error("Crawl job completed but no data was returned");
}
} else if (['active', 'paused', 'pending', 'queued'].includes(statusData.status)) {
} else if (
["active", "paused", "pending", "queued"].includes(statusData.status)
) {
if (timeout < 2) {
timeout = 2;
}
await new Promise(resolve => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again
await new Promise((resolve) => setTimeout(resolve, timeout * 1000)); // Wait for the specified timeout before checking again
} else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
throw new Error(
`Crawl job failed or was stopped. Status: ${statusData.status}`
);
}
} else {
this.handleError(statusResponse, 'check crawl status');
this.handleError(statusResponse, "check crawl status");
}
}
}
@@ -260,11 +327,16 @@ export default class FirecrawlApp {
* @param {string} action - The action being performed when the error occurred.
*/
handleError(response: AxiosResponse, action: string): void {
if ([402, 409, 500].includes(response.status)) {
const errorMessage: string = response.data.error || 'Unknown error occurred';
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage: string =
response.data.error || "Unknown error occurred";
throw new Error(
`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`
);
} else {
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
throw new Error(
`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`
);
}
}
}
+7 -1
View File
@@ -1,4 +1,5 @@
import { AxiosResponse, AxiosRequestHeaders } from 'axios';
import { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
/**
* Configuration interface for FirecrawlApp.
*/
@@ -10,6 +11,11 @@ export interface FirecrawlAppConfig {
*/
export interface Params {
[key: string]: any;
extractorOptions?: {
extractionSchema: z.ZodSchema | any;
mode?: "llm-extraction";
extractionPrompt?: string;
};
}
/**
* Response interface for scraping operations.