Merge branch 'main' into nsc/new-extract

This commit is contained in:
Nicolas
2024-11-13 17:14:43 -05:00
18 changed files with 140 additions and 110 deletions
+1 -1
View File
@@ -84,7 +84,6 @@
"escape-html": "^1.0.3", "escape-html": "^1.0.3",
"express-rate-limit": "^7.3.1", "express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2", "express-ws": "^5.0.2",
"form-data": "^4.0.0",
"glob": "^10.4.2", "glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5", "gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.4.1", "ioredis": "^5.4.1",
@@ -117,6 +116,7 @@
"turndown": "^7.1.3", "turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2", "turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4", "typesense": "^1.5.4",
"undici": "^6.20.1",
"unstructured-client": "^0.11.3", "unstructured-client": "^0.11.3",
"uuid": "^10.0.0", "uuid": "^10.0.0",
"winston": "^3.14.2", "winston": "^3.14.2",
+9 -3
View File
@@ -110,9 +110,6 @@ importers:
express-ws: express-ws:
specifier: ^5.0.2 specifier: ^5.0.2
version: 5.0.2(express@4.19.2) version: 5.0.2(express@4.19.2)
form-data:
specifier: ^4.0.0
version: 4.0.0
glob: glob:
specifier: ^10.4.2 specifier: ^10.4.2
version: 10.4.2 version: 10.4.2
@@ -209,6 +206,9 @@ importers:
typesense: typesense:
specifier: ^1.5.4 specifier: ^1.5.4
version: 1.8.2(@babel/runtime@7.24.6) version: 1.8.2(@babel/runtime@7.24.6)
undici:
specifier: ^6.20.1
version: 6.20.1
unstructured-client: unstructured-client:
specifier: ^0.11.3 specifier: ^0.11.3
version: 0.11.3(zod@3.23.8) version: 0.11.3(zod@3.23.8)
@@ -5007,6 +5007,10 @@ packages:
undici-types@5.26.5: undici-types@5.26.5:
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
undici@6.20.1:
resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==}
engines: {node: '>=18.17'}
union@0.5.0: union@0.5.0:
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==} resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
engines: {node: '>= 0.8.0'} engines: {node: '>= 0.8.0'}
@@ -11209,6 +11213,8 @@ snapshots:
undici-types@5.26.5: {} undici-types@5.26.5: {}
undici@6.20.1: {}
union@0.5.0: union@0.5.0:
dependencies: dependencies:
qs: 6.12.2 qs: 6.12.2
+26 -59
View File
@@ -1,5 +1,10 @@
### Crawl Website # Pick your baseUrl here:
POST http://localhost:3002/v0/scrape HTTP/1.1 @baseUrl = http://localhost:3002
# @baseUrl = https://api.firecrawl.dev
### Scrape Website
# @name scrape
POST {{baseUrl}}/v1/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}} Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
@@ -7,17 +12,9 @@ content-type: application/json
"url":"firecrawl.dev" "url":"firecrawl.dev"
} }
### Check Job Status ### Crawl Website
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 # @name crawl
Authorization: Bearer {{$dotenv TEST_API_KEY}} POST {{baseUrl}}/v1/crawl HTTP/1.1
### Check Job Status
GET http://localhost:3002/v0/jobs/active HTTP/1.1
### Scrape Website
POST http://localhost:3002/v0/crawl HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}} Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
@@ -25,58 +22,28 @@ content-type: application/json
"url": "firecrawl.dev" "url": "firecrawl.dev"
} }
## "reoveTags": [], ### Check Crawl Status
# "mode": "crawl", @crawlId = {{crawl.response.body.$.id}}
# "crawlerOptions": { # @name crawlStatus
# "allowBackwardCrawling": false GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
# }, Authorization: Bearer {{$dotenv TEST_API_KEY}}
# "pageOptions": {
# "onlyMainContent": false,
# "includeHtml": false,
# "parsePDF": true
# }
### Batch Scrape Websites
# @name batchScrape
POST {{baseUrl}}/v1/batch/scrape HTTP/1.1
### Scrape Website
POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}} Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
{ {
"url":"https://mendable.ai" "urls": [
"firecrawl.dev",
"mendable.ai"
]
} }
### Check Batch Scrape Status
@batchScrapeId = {{batchScrape.response.body.$.id}}
### Check Job Status # @name batchScrapeStatus
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}} Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Get Job Result
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"url":"https://mendable.ai"
}
### Check Job Status
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Get Active Jobs Count
GET http://localhost:3002/serverHealthCheck
content-type: application/json
### Notify Server Health Check
GET http://localhost:3002/serverHealthCheck/notify
content-type: application/json
+1 -1
View File
@@ -75,7 +75,7 @@ export async function crawlStatusController(req: Request, res: Response) {
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active"; const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active";
const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
if ( if (
jobs.length > 0 && jobs.length > 0 &&
+2
View File
@@ -138,6 +138,8 @@ export async function crawlController(req: Request, res: Response) {
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
delete (scrapeOptions as any).timeout;
const sc: StoredCrawl = { const sc: StoredCrawl = {
originUrl: url, originUrl: url,
crawlerOptions, crawlerOptions,
+12 -1
View File
@@ -239,11 +239,22 @@ const crawlerOptions = z.object({
export type CrawlerOptions = z.infer<typeof crawlerOptions>; export type CrawlerOptions = z.infer<typeof crawlerOptions>;
/**
 * Schema for a webhook target.
 *
 * Accepts either a bare string, which is wrapped as `{ url: <string> }`
 * before validation, or an object. The object must have a `url` that is a
 * valid URL string and may have `headers` (string keys to string values,
 * defaulting to `{}`). Unknown keys are rejected via `.strict(strictMessage)`.
 */
export const webhookSchema = z.preprocess(
  (raw) => (typeof raw === "string" ? { url: raw } : raw),
  z
    .object({
      url: z.string().url(),
      headers: z.record(z.string(), z.string()).default({}),
    })
    .strict(strictMessage),
);
export const crawlRequestSchema = crawlerOptions.extend({ export const crawlRequestSchema = crawlerOptions.extend({
url, url,
origin: z.string().optional().default("api"), origin: z.string().optional().default("api"),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
webhook: z.string().url().optional(), webhook: webhookSchema.optional(),
limit: z.number().default(10000), limit: z.number().default(10000),
}).strict(strictMessage); }).strict(strictMessage);
+3 -2
View File
@@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
return res; return res;
} }
export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler { export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: id, jobId: id,
initialUrl: initialUrl ?? sc.originUrl!, initialUrl: sc.originUrl!,
baseUrl: newBase ? new URL(newBase).origin : undefined,
includes: sc.crawlerOptions?.includes ?? [], includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [], excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
+3 -1
View File
@@ -27,6 +27,7 @@ export class WebCrawler {
constructor({ constructor({
jobId, jobId,
initialUrl, initialUrl,
baseUrl,
includes, includes,
excludes, excludes,
maxCrawledLinks = 10000, maxCrawledLinks = 10000,
@@ -38,6 +39,7 @@ export class WebCrawler {
}: { }: {
jobId: string; jobId: string;
initialUrl: string; initialUrl: string;
baseUrl?: string;
includes?: string[]; includes?: string[];
excludes?: string[]; excludes?: string[];
maxCrawledLinks?: number; maxCrawledLinks?: number;
@@ -49,7 +51,7 @@ export class WebCrawler {
}) { }) {
this.jobId = jobId; this.jobId = jobId;
this.initialUrl = initialUrl; this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin; this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
this.includes = Array.isArray(includes) ? includes : []; this.includes = Array.isArray(includes) ? includes : [];
this.excludes = Array.isArray(excludes) ? excludes : []; this.excludes = Array.isArray(excludes) ? excludes : [];
this.limit = limit; this.limit = limit;
@@ -1,6 +1,6 @@
import { Meta } from "../.."; import { Meta } from "../..";
import { EngineScrapeResult } from ".."; import { EngineScrapeResult } from "..";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { downloadFile } from "../utils/downloadFile";
import mammoth from "mammoth"; import mammoth from "mammoth";
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> { export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
@@ -1,5 +1,4 @@
import { createReadStream, promises as fs } from "node:fs"; import { createReadStream, promises as fs } from "node:fs";
import FormData from "form-data";
import { Meta } from "../.."; import { Meta } from "../..";
import { EngineScrapeResult } from ".."; import { EngineScrapeResult } from "..";
import * as marked from "marked"; import * as marked from "marked";
@@ -16,10 +15,26 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath }); meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
const uploadForm = new FormData(); const uploadForm = new FormData();
uploadForm.append("file", createReadStream(tempFilePath), {
filename: tempFilePath, // This is utterly stupid but it works! - mogery
contentType: "application/pdf", // NOTE: request.headers["Content-Type"]? uploadForm.append("file", {
}); [Symbol.toStringTag]: "Blob",
name: tempFilePath,
stream() {
return createReadStream(tempFilePath) as unknown as ReadableStream<Uint8Array>
},
arrayBuffer() {
throw Error("Unimplemented in mock Blob: arrayBuffer")
},
size: (await fs.stat(tempFilePath)).size,
text() {
throw Error("Unimplemented in mock Blob: text")
},
slice(start, end, contentType) {
throw Error("Unimplemented in mock Blob: slice")
},
type: "application/pdf",
} as Blob);
const upload = await robustFetch({ const upload = await robustFetch({
url: "https://api.cloud.llamaindex.ai/api/parsing/upload", url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
@@ -47,6 +62,8 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
schema: z.object({ schema: z.object({
markdown: z.string(), markdown: z.string(),
}), }),
tryCount: 32,
tryCooldown: 250,
}); });
return { return {
@@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs";
import { EngineError } from "../../error"; import { EngineError } from "../../error";
import { Writable } from "stream"; import { Writable } from "stream";
import { v4 as uuid } from "uuid"; import { v4 as uuid } from "uuid";
import * as undici from "undici";
export async function fetchFileToBuffer(url: string): Promise<{ export async function fetchFileToBuffer(url: string): Promise<{
response: Response, response: Response,
@@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{
} }
export async function downloadFile(id: string, url: string): Promise<{ export async function downloadFile(id: string, url: string): Promise<{
response: Response response: undici.Response
tempFilePath: string tempFilePath: string
}> { }> {
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`); const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
const tempFileWrite = createWriteStream(tempFilePath); const tempFileWrite = createWriteStream(tempFilePath);
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying // TODO: maybe we could use tlsclient for this? for proxying
// Use undici so TLS certificate verification can be skipped for now (rejectUnauthorized: false below).
const response = await undici.fetch(url, {
dispatcher: new undici.Agent({
connect: {
rejectUnauthorized: false,
},
})
});
// This should never happen in the current state of JS (2024), but let's check anyways. // This should never happen in the current state of JS (2024), but let's check anyways.
if (response.body === null) { if (response.body === null) {
+8 -4
View File
@@ -2,7 +2,6 @@ import { Logger } from "winston";
import { z, ZodError } from "zod"; import { z, ZodError } from "zod";
import { v4 as uuid } from "uuid"; import { v4 as uuid } from "uuid";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import FormData from "form-data";
export type RobustFetchParams<Schema extends z.Schema<any>> = { export type RobustFetchParams<Schema extends z.Schema<any>> = {
url: string; url: string;
@@ -16,6 +15,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
ignoreFailure?: boolean; ignoreFailure?: boolean;
requestId?: string; requestId?: string;
tryCount?: number; tryCount?: number;
tryCooldown?: number;
}; };
export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer<Schema>>({ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer<Schema>>({
@@ -29,8 +29,9 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
ignoreFailure = false, ignoreFailure = false,
requestId = uuid(), requestId = uuid(),
tryCount = 1, tryCount = 1,
tryCooldown,
}: RobustFetchParams<Schema>): Promise<Output> { }: RobustFetchParams<Schema>): Promise<Output> {
const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount }; const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount, tryCooldown };
let request: Response; let request: Response;
try { try {
@@ -38,14 +39,14 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
method, method,
headers: { headers: {
...(body instanceof FormData ...(body instanceof FormData
? body.getHeaders() ? ({})
: body !== undefined ? ({ : body !== undefined ? ({
"Content-Type": "application/json", "Content-Type": "application/json",
}) : {}), }) : {}),
...(headers !== undefined ? headers : {}), ...(headers !== undefined ? headers : {}),
}, },
...(body instanceof FormData ? ({ ...(body instanceof FormData ? ({
body: body.getBuffer(), body,
}) : body !== undefined ? ({ }) : body !== undefined ? ({
body: JSON.stringify(body), body: JSON.stringify(body),
}) : {}), }) : {}),
@@ -87,6 +88,9 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
if (request.status >= 300) { if (request.status >= 300) {
if (tryCount > 1) { if (tryCount > 1) {
logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId }); logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId });
if (tryCooldown !== undefined) {
await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown));
}
return await robustFetch({ return await robustFetch({
...params, ...params,
requestId, requestId,
@@ -6,6 +6,7 @@ import { logger } from "../../../src/lib/logger";
import { sendSlackWebhook } from "../alerts/slack"; import { sendSlackWebhook } from "../alerts/slack";
import { getNotificationString } from "./notification_string"; import { getNotificationString } from "./notification_string";
import { AuthCreditUsageChunk } from "../../controllers/v1/types"; import { AuthCreditUsageChunk } from "../../controllers/v1/types";
import { redlock } from "../redlock";
const emailTemplates: Record< const emailTemplates: Record<
NotificationType, NotificationType,
@@ -88,6 +89,7 @@ export async function sendNotificationInternal(
if (team_id === "preview") { if (team_id === "preview") {
return { success: true }; return { success: true };
} }
return await redlock.using([`notification-lock:${team_id}:${notificationType}`], 5000, async () => {
if (!bypassRecentChecks) { if (!bypassRecentChecks) {
const fifteenDaysAgo = new Date(); const fifteenDaysAgo = new Date();
@@ -172,4 +174,5 @@ export async function sendNotificationInternal(
} }
return { success: true }; return { success: true };
});
} }
+19 -19
View File
@@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {
if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) { if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined); const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
const links = crawler.filterLinks( const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string), crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
@@ -568,24 +568,24 @@ async function processJob(job: Job & { id: string }, token: string) {
crawl_id: job.data.crawl_id, crawl_id: job.data.crawl_id,
}); });
await logJob({ // await logJob({
job_id: job.data.crawl_id, // job_id: job.data.crawl_id,
success: false, // success: false,
message: // message:
typeof error === "string" // typeof error === "string"
? error // ? error
: error.message ?? // : error.message ??
"Something went wrong... Contact help@mendable.ai", // "Something went wrong... Contact help@mendable.ai",
num_docs: 0, // num_docs: 0,
docs: [], // docs: [],
time_taken: 0, // time_taken: 0,
team_id: job.data.team_id, // team_id: job.data.team_id,
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape", // mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc ? sc.originUrl ?? job.data.url : job.data.url, // url: sc ? sc.originUrl ?? job.data.url : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : undefined, // crawlerOptions: sc ? sc.crawlerOptions : undefined,
scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions, // scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
origin: job.data.origin, // origin: job.data.origin,
}); // });
} }
// done(null, data); // done(null, data);
return data; return data;
+9 -5
View File
@@ -1,15 +1,17 @@
import axios from "axios"; import axios from "axios";
import { logger } from "../../src/lib/logger"; import { logger } from "../lib/logger";
import { supabase_service } from "./supabase"; import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types"; import { WebhookEventType } from "../types";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { z } from "zod";
import { webhookSchema } from "../controllers/v1/types";
configDotenv(); configDotenv();
export const callWebhook = async ( export const callWebhook = async (
teamId: string, teamId: string,
id: string, id: string,
data: any | null, data: any | null,
specified?: string, specified?: z.infer<typeof webhookSchema>,
v1 = false, v1 = false,
eventType: WebhookEventType = "crawl.page", eventType: WebhookEventType = "crawl.page",
awaitWebhook: boolean = false awaitWebhook: boolean = false
@@ -20,7 +22,7 @@ export const callWebhook = async (
id id
); );
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
let webhookUrl = specified ?? selfHostedUrl; let webhookUrl = specified ?? (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined);
// Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
// and the USE_DB_AUTHENTICATION environment variable is set to true // and the USE_DB_AUTHENTICATION environment variable is set to true
@@ -73,7 +75,7 @@ export const callWebhook = async (
if (awaitWebhook) { if (awaitWebhook) {
try { try {
await axios.post( await axios.post(
webhookUrl, webhookUrl.url,
{ {
success: !v1 success: !v1
? data.success ? data.success
@@ -92,6 +94,7 @@ export const callWebhook = async (
{ {
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
...webhookUrl.headers,
}, },
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
} }
@@ -104,7 +107,7 @@ export const callWebhook = async (
} else { } else {
axios axios
.post( .post(
webhookUrl, webhookUrl.url,
{ {
success: !v1 success: !v1
? data.success ? data.success
@@ -123,6 +126,7 @@ export const callWebhook = async (
{ {
headers: { headers: {
"Content-Type": "application/json", "Content-Type": "application/json",
...webhookUrl.headers,
}, },
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1) timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
} }
+3 -2
View File
@@ -1,4 +1,5 @@
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types"; import { z } from "zod";
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, webhookSchema } from "./controllers/v1/types";
import { ExtractorOptions, Document } from "./lib/entities"; import { ExtractorOptions, Document } from "./lib/entities";
import { InternalOptions } from "./scraper/scrapeURL"; import { InternalOptions } from "./scraper/scrapeURL";
@@ -33,7 +34,7 @@ export interface WebScraperOptions {
origin?: string; origin?: string;
crawl_id?: string; crawl_id?: string;
sitemapped?: boolean; sitemapped?: boolean;
webhook?: string; webhook?: z.infer<typeof webhookSchema>;
v1?: boolean; v1?: boolean;
is_scrape?: boolean; is_scrape?: boolean;
} }
+1 -1
View File
@@ -1,6 +1,6 @@
{ {
"name": "@mendable/firecrawl-js", "name": "@mendable/firecrawl-js",
"version": "1.8.1", "version": "1.8.2",
"description": "JavaScript SDK for Firecrawl API", "description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",
+4 -1
View File
@@ -153,7 +153,10 @@ export interface CrawlParams {
allowExternalLinks?: boolean; allowExternalLinks?: boolean;
ignoreSitemap?: boolean; ignoreSitemap?: boolean;
scrapeOptions?: CrawlScrapeOptions; scrapeOptions?: CrawlScrapeOptions;
webhook?: string; webhook?: string | {
url: string;
headers?: Record<string, string>;
};
deduplicateSimilarURLs?: boolean; deduplicateSimilarURLs?: boolean;
ignoreQueryParameters?: boolean; ignoreQueryParameters?: boolean;
} }