Merge branch 'main' into nsc/new-extract
This commit is contained in:
@@ -84,7 +84,6 @@
|
|||||||
"escape-html": "^1.0.3",
|
"escape-html": "^1.0.3",
|
||||||
"express-rate-limit": "^7.3.1",
|
"express-rate-limit": "^7.3.1",
|
||||||
"express-ws": "^5.0.2",
|
"express-ws": "^5.0.2",
|
||||||
"form-data": "^4.0.0",
|
|
||||||
"glob": "^10.4.2",
|
"glob": "^10.4.2",
|
||||||
"gpt3-tokenizer": "^1.1.5",
|
"gpt3-tokenizer": "^1.1.5",
|
||||||
"ioredis": "^5.4.1",
|
"ioredis": "^5.4.1",
|
||||||
@@ -117,6 +116,7 @@
|
|||||||
"turndown": "^7.1.3",
|
"turndown": "^7.1.3",
|
||||||
"turndown-plugin-gfm": "^1.0.2",
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
"typesense": "^1.5.4",
|
"typesense": "^1.5.4",
|
||||||
|
"undici": "^6.20.1",
|
||||||
"unstructured-client": "^0.11.3",
|
"unstructured-client": "^0.11.3",
|
||||||
"uuid": "^10.0.0",
|
"uuid": "^10.0.0",
|
||||||
"winston": "^3.14.2",
|
"winston": "^3.14.2",
|
||||||
|
|||||||
Generated
+9
-3
@@ -110,9 +110,6 @@ importers:
|
|||||||
express-ws:
|
express-ws:
|
||||||
specifier: ^5.0.2
|
specifier: ^5.0.2
|
||||||
version: 5.0.2(express@4.19.2)
|
version: 5.0.2(express@4.19.2)
|
||||||
form-data:
|
|
||||||
specifier: ^4.0.0
|
|
||||||
version: 4.0.0
|
|
||||||
glob:
|
glob:
|
||||||
specifier: ^10.4.2
|
specifier: ^10.4.2
|
||||||
version: 10.4.2
|
version: 10.4.2
|
||||||
@@ -209,6 +206,9 @@ importers:
|
|||||||
typesense:
|
typesense:
|
||||||
specifier: ^1.5.4
|
specifier: ^1.5.4
|
||||||
version: 1.8.2(@babel/runtime@7.24.6)
|
version: 1.8.2(@babel/runtime@7.24.6)
|
||||||
|
undici:
|
||||||
|
specifier: ^6.20.1
|
||||||
|
version: 6.20.1
|
||||||
unstructured-client:
|
unstructured-client:
|
||||||
specifier: ^0.11.3
|
specifier: ^0.11.3
|
||||||
version: 0.11.3(zod@3.23.8)
|
version: 0.11.3(zod@3.23.8)
|
||||||
@@ -5007,6 +5007,10 @@ packages:
|
|||||||
undici-types@5.26.5:
|
undici-types@5.26.5:
|
||||||
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
|
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
|
||||||
|
|
||||||
|
undici@6.20.1:
|
||||||
|
resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==}
|
||||||
|
engines: {node: '>=18.17'}
|
||||||
|
|
||||||
union@0.5.0:
|
union@0.5.0:
|
||||||
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
|
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
|
||||||
engines: {node: '>= 0.8.0'}
|
engines: {node: '>= 0.8.0'}
|
||||||
@@ -11209,6 +11213,8 @@ snapshots:
|
|||||||
|
|
||||||
undici-types@5.26.5: {}
|
undici-types@5.26.5: {}
|
||||||
|
|
||||||
|
undici@6.20.1: {}
|
||||||
|
|
||||||
union@0.5.0:
|
union@0.5.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
qs: 6.12.2
|
qs: 6.12.2
|
||||||
|
|||||||
+27
-60
@@ -1,5 +1,10 @@
|
|||||||
### Crawl Website
|
# Pick your baseUrl here:
|
||||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
@baseUrl = http://localhost:3002
|
||||||
|
# @baseUrl = https://api.firecrawl.dev
|
||||||
|
|
||||||
|
### Scrape Website
|
||||||
|
# @name scrape
|
||||||
|
POST {{baseUrl}}/v1/scrape HTTP/1.1
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
@@ -7,17 +12,9 @@ content-type: application/json
|
|||||||
"url":"firecrawl.dev"
|
"url":"firecrawl.dev"
|
||||||
}
|
}
|
||||||
|
|
||||||
### Check Job Status
|
### Crawl Website
|
||||||
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
|
# @name crawl
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
POST {{baseUrl}}/v1/crawl HTTP/1.1
|
||||||
|
|
||||||
|
|
||||||
### Check Job Status
|
|
||||||
GET http://localhost:3002/v0/jobs/active HTTP/1.1
|
|
||||||
|
|
||||||
|
|
||||||
### Scrape Website
|
|
||||||
POST http://localhost:3002/v0/crawl HTTP/1.1
|
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
@@ -25,58 +22,28 @@ content-type: application/json
|
|||||||
"url": "firecrawl.dev"
|
"url": "firecrawl.dev"
|
||||||
}
|
}
|
||||||
|
|
||||||
## "reoveTags": [],
|
### Check Crawl Status
|
||||||
# "mode": "crawl",
|
@crawlId = {{crawl.response.body.$.id}}
|
||||||
# "crawlerOptions": {
|
# @name crawlStatus
|
||||||
# "allowBackwardCrawling": false
|
GET {{baseUrl}}/v1/crawl/{{crawlId}} HTTP/1.1
|
||||||
# },
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
# "pageOptions": {
|
|
||||||
# "onlyMainContent": false,
|
|
||||||
# "includeHtml": false,
|
|
||||||
# "parsePDF": true
|
|
||||||
# }
|
|
||||||
|
|
||||||
|
|
||||||
|
### Batch Scrape Websites
|
||||||
|
# @name batchScrape
|
||||||
|
POST {{baseUrl}}/v1/batch/scrape HTTP/1.1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Scrape Website
|
|
||||||
POST http://localhost:3002/v0/scrape HTTP/1.1
|
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
content-type: application/json
|
content-type: application/json
|
||||||
|
|
||||||
{
|
{
|
||||||
"url":"https://mendable.ai"
|
"urls": [
|
||||||
|
"firecrawl.dev",
|
||||||
|
"mendable.ai"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
### Check Batch Scrape Status
|
||||||
|
@batchScrapeId = {{batchScrape.response.body.$.id}}
|
||||||
### Check Job Status
|
# @name batchScrapeStatus
|
||||||
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
|
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
||||||
|
|
||||||
### Get Job Result
|
|
||||||
|
|
||||||
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
|
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
|
||||||
content-type: application/json
|
|
||||||
|
|
||||||
{
|
|
||||||
"url":"https://mendable.ai"
|
|
||||||
}
|
|
||||||
|
|
||||||
### Check Job Status
|
|
||||||
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
|
|
||||||
Authorization: Bearer {{$dotenv TEST_API_KEY}}
|
|
||||||
|
|
||||||
### Get Active Jobs Count
|
|
||||||
GET http://localhost:3002/serverHealthCheck
|
|
||||||
content-type: application/json
|
|
||||||
|
|
||||||
### Notify Server Health Check
|
|
||||||
GET http://localhost:3002/serverHealthCheck/notify
|
|
||||||
content-type: application/json
|
|
||||||
|
|
||||||
@@ -75,7 +75,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active";
|
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active";
|
||||||
|
|
||||||
const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit").map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
jobs.length > 0 &&
|
jobs.length > 0 &&
|
||||||
|
|||||||
@@ -138,6 +138,8 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
||||||
|
|
||||||
|
delete (scrapeOptions as any).timeout;
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: url,
|
originUrl: url,
|
||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
|
|||||||
@@ -239,11 +239,22 @@ const crawlerOptions = z.object({
|
|||||||
|
|
||||||
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
||||||
|
|
||||||
|
export const webhookSchema = z.preprocess(x => {
|
||||||
|
if (typeof x === "string") {
|
||||||
|
return { url: x };
|
||||||
|
} else {
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
}, z.object({
|
||||||
|
url: z.string().url(),
|
||||||
|
headers: z.record(z.string(), z.string()).default({}),
|
||||||
|
}).strict(strictMessage))
|
||||||
|
|
||||||
export const crawlRequestSchema = crawlerOptions.extend({
|
export const crawlRequestSchema = crawlerOptions.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
||||||
webhook: z.string().url().optional(),
|
webhook: webhookSchema.optional(),
|
||||||
limit: z.number().default(10000),
|
limit: z.number().default(10000),
|
||||||
}).strict(strictMessage);
|
}).strict(strictMessage);
|
||||||
|
|
||||||
|
|||||||
@@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler {
|
export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
initialUrl: initialUrl ?? sc.originUrl!,
|
initialUrl: sc.originUrl!,
|
||||||
|
baseUrl: newBase ? new URL(newBase).origin : undefined,
|
||||||
includes: sc.crawlerOptions?.includes ?? [],
|
includes: sc.crawlerOptions?.includes ?? [],
|
||||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ export class WebCrawler {
|
|||||||
constructor({
|
constructor({
|
||||||
jobId,
|
jobId,
|
||||||
initialUrl,
|
initialUrl,
|
||||||
|
baseUrl,
|
||||||
includes,
|
includes,
|
||||||
excludes,
|
excludes,
|
||||||
maxCrawledLinks = 10000,
|
maxCrawledLinks = 10000,
|
||||||
@@ -38,6 +39,7 @@ export class WebCrawler {
|
|||||||
}: {
|
}: {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
|
baseUrl?: string;
|
||||||
includes?: string[];
|
includes?: string[];
|
||||||
excludes?: string[];
|
excludes?: string[];
|
||||||
maxCrawledLinks?: number;
|
maxCrawledLinks?: number;
|
||||||
@@ -49,7 +51,7 @@ export class WebCrawler {
|
|||||||
}) {
|
}) {
|
||||||
this.jobId = jobId;
|
this.jobId = jobId;
|
||||||
this.initialUrl = initialUrl;
|
this.initialUrl = initialUrl;
|
||||||
this.baseUrl = new URL(initialUrl).origin;
|
this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
|
||||||
this.includes = Array.isArray(includes) ? includes : [];
|
this.includes = Array.isArray(includes) ? includes : [];
|
||||||
this.excludes = Array.isArray(excludes) ? excludes : [];
|
this.excludes = Array.isArray(excludes) ? excludes : [];
|
||||||
this.limit = limit;
|
this.limit = limit;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
import { downloadFile } from "../utils/downloadFile";
|
||||||
import mammoth from "mammoth";
|
import mammoth from "mammoth";
|
||||||
|
|
||||||
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import { createReadStream, promises as fs } from "node:fs";
|
import { createReadStream, promises as fs } from "node:fs";
|
||||||
import FormData from "form-data";
|
|
||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import * as marked from "marked";
|
import * as marked from "marked";
|
||||||
@@ -16,10 +15,26 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||||||
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
|
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
|
||||||
|
|
||||||
const uploadForm = new FormData();
|
const uploadForm = new FormData();
|
||||||
uploadForm.append("file", createReadStream(tempFilePath), {
|
|
||||||
filename: tempFilePath,
|
// This is utterly stupid but it works! - mogery
|
||||||
contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
|
uploadForm.append("file", {
|
||||||
});
|
[Symbol.toStringTag]: "Blob",
|
||||||
|
name: tempFilePath,
|
||||||
|
stream() {
|
||||||
|
return createReadStream(tempFilePath) as unknown as ReadableStream<Uint8Array>
|
||||||
|
},
|
||||||
|
arrayBuffer() {
|
||||||
|
throw Error("Unimplemented in mock Blob: arrayBuffer")
|
||||||
|
},
|
||||||
|
size: (await fs.stat(tempFilePath)).size,
|
||||||
|
text() {
|
||||||
|
throw Error("Unimplemented in mock Blob: text")
|
||||||
|
},
|
||||||
|
slice(start, end, contentType) {
|
||||||
|
throw Error("Unimplemented in mock Blob: slice")
|
||||||
|
},
|
||||||
|
type: "application/pdf",
|
||||||
|
} as Blob);
|
||||||
|
|
||||||
const upload = await robustFetch({
|
const upload = await robustFetch({
|
||||||
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
||||||
@@ -47,6 +62,8 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||||||
schema: z.object({
|
schema: z.object({
|
||||||
markdown: z.string(),
|
markdown: z.string(),
|
||||||
}),
|
}),
|
||||||
|
tryCount: 32,
|
||||||
|
tryCooldown: 250,
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs";
|
|||||||
import { EngineError } from "../../error";
|
import { EngineError } from "../../error";
|
||||||
import { Writable } from "stream";
|
import { Writable } from "stream";
|
||||||
import { v4 as uuid } from "uuid";
|
import { v4 as uuid } from "uuid";
|
||||||
|
import * as undici from "undici";
|
||||||
|
|
||||||
export async function fetchFileToBuffer(url: string): Promise<{
|
export async function fetchFileToBuffer(url: string): Promise<{
|
||||||
response: Response,
|
response: Response,
|
||||||
@@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function downloadFile(id: string, url: string): Promise<{
|
export async function downloadFile(id: string, url: string): Promise<{
|
||||||
response: Response
|
response: undici.Response
|
||||||
tempFilePath: string
|
tempFilePath: string
|
||||||
}> {
|
}> {
|
||||||
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
||||||
const tempFileWrite = createWriteStream(tempFilePath);
|
const tempFileWrite = createWriteStream(tempFilePath);
|
||||||
|
|
||||||
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
// TODO: maybe we could use tlsclient for this? for proxying
|
||||||
|
// use undici to ignore SSL for now
|
||||||
|
const response = await undici.fetch(url, {
|
||||||
|
dispatcher: new undici.Agent({
|
||||||
|
connect: {
|
||||||
|
rejectUnauthorized: false,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
// This should never happen in the current state of JS (2024), but let's check anyways.
|
// This should never happen in the current state of JS (2024), but let's check anyways.
|
||||||
if (response.body === null) {
|
if (response.body === null) {
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ import { Logger } from "winston";
|
|||||||
import { z, ZodError } from "zod";
|
import { z, ZodError } from "zod";
|
||||||
import { v4 as uuid } from "uuid";
|
import { v4 as uuid } from "uuid";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import FormData from "form-data";
|
|
||||||
|
|
||||||
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
||||||
url: string;
|
url: string;
|
||||||
@@ -16,6 +15,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
|||||||
ignoreFailure?: boolean;
|
ignoreFailure?: boolean;
|
||||||
requestId?: string;
|
requestId?: string;
|
||||||
tryCount?: number;
|
tryCount?: number;
|
||||||
|
tryCooldown?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer<Schema>>({
|
export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer<Schema>>({
|
||||||
@@ -29,8 +29,9 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
ignoreFailure = false,
|
ignoreFailure = false,
|
||||||
requestId = uuid(),
|
requestId = uuid(),
|
||||||
tryCount = 1,
|
tryCount = 1,
|
||||||
|
tryCooldown,
|
||||||
}: RobustFetchParams<Schema>): Promise<Output> {
|
}: RobustFetchParams<Schema>): Promise<Output> {
|
||||||
const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount };
|
const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount, tryCooldown };
|
||||||
|
|
||||||
let request: Response;
|
let request: Response;
|
||||||
try {
|
try {
|
||||||
@@ -38,14 +39,14 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
method,
|
method,
|
||||||
headers: {
|
headers: {
|
||||||
...(body instanceof FormData
|
...(body instanceof FormData
|
||||||
? body.getHeaders()
|
? ({})
|
||||||
: body !== undefined ? ({
|
: body !== undefined ? ({
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
}) : {}),
|
}) : {}),
|
||||||
...(headers !== undefined ? headers : {}),
|
...(headers !== undefined ? headers : {}),
|
||||||
},
|
},
|
||||||
...(body instanceof FormData ? ({
|
...(body instanceof FormData ? ({
|
||||||
body: body.getBuffer(),
|
body,
|
||||||
}) : body !== undefined ? ({
|
}) : body !== undefined ? ({
|
||||||
body: JSON.stringify(body),
|
body: JSON.stringify(body),
|
||||||
}) : {}),
|
}) : {}),
|
||||||
@@ -87,6 +88,9 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
if (request.status >= 300) {
|
if (request.status >= 300) {
|
||||||
if (tryCount > 1) {
|
if (tryCount > 1) {
|
||||||
logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId });
|
logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId });
|
||||||
|
if (tryCooldown !== undefined) {
|
||||||
|
await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown));
|
||||||
|
}
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
...params,
|
...params,
|
||||||
requestId,
|
requestId,
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import { logger } from "../../../src/lib/logger";
|
|||||||
import { sendSlackWebhook } from "../alerts/slack";
|
import { sendSlackWebhook } from "../alerts/slack";
|
||||||
import { getNotificationString } from "./notification_string";
|
import { getNotificationString } from "./notification_string";
|
||||||
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
|
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
|
||||||
|
import { redlock } from "../redlock";
|
||||||
|
|
||||||
const emailTemplates: Record<
|
const emailTemplates: Record<
|
||||||
NotificationType,
|
NotificationType,
|
||||||
@@ -88,6 +89,7 @@ export async function sendNotificationInternal(
|
|||||||
if (team_id === "preview") {
|
if (team_id === "preview") {
|
||||||
return { success: true };
|
return { success: true };
|
||||||
}
|
}
|
||||||
|
return await redlock.using([`notification-lock:${team_id}:${notificationType}`], 5000, async () => {
|
||||||
|
|
||||||
if (!bypassRecentChecks) {
|
if (!bypassRecentChecks) {
|
||||||
const fifteenDaysAgo = new Date();
|
const fifteenDaysAgo = new Date();
|
||||||
@@ -171,5 +173,6 @@ export async function sendNotificationInternal(
|
|||||||
return { success: false };
|
return { success: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
return { success: true };
|
return { success: true };
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
|
|
||||||
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
||||||
if (!sc.cancelled) {
|
if (!sc.cancelled) {
|
||||||
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined);
|
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
|
||||||
|
|
||||||
const links = crawler.filterLinks(
|
const links = crawler.filterLinks(
|
||||||
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
||||||
@@ -568,24 +568,24 @@ async function processJob(job: Job & { id: string }, token: string) {
|
|||||||
crawl_id: job.data.crawl_id,
|
crawl_id: job.data.crawl_id,
|
||||||
});
|
});
|
||||||
|
|
||||||
await logJob({
|
// await logJob({
|
||||||
job_id: job.data.crawl_id,
|
// job_id: job.data.crawl_id,
|
||||||
success: false,
|
// success: false,
|
||||||
message:
|
// message:
|
||||||
typeof error === "string"
|
// typeof error === "string"
|
||||||
? error
|
// ? error
|
||||||
: error.message ??
|
// : error.message ??
|
||||||
"Something went wrong... Contact help@mendable.ai",
|
// "Something went wrong... Contact help@mendable.ai",
|
||||||
num_docs: 0,
|
// num_docs: 0,
|
||||||
docs: [],
|
// docs: [],
|
||||||
time_taken: 0,
|
// time_taken: 0,
|
||||||
team_id: job.data.team_id,
|
// team_id: job.data.team_id,
|
||||||
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
|
// mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
|
||||||
url: sc ? sc.originUrl ?? job.data.url : job.data.url,
|
// url: sc ? sc.originUrl ?? job.data.url : job.data.url,
|
||||||
crawlerOptions: sc ? sc.crawlerOptions : undefined,
|
// crawlerOptions: sc ? sc.crawlerOptions : undefined,
|
||||||
scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
|
// scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
|
||||||
origin: job.data.origin,
|
// origin: job.data.origin,
|
||||||
});
|
// });
|
||||||
}
|
}
|
||||||
// done(null, data);
|
// done(null, data);
|
||||||
return data;
|
return data;
|
||||||
|
|||||||
@@ -1,15 +1,17 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { logger } from "../../src/lib/logger";
|
import { logger } from "../lib/logger";
|
||||||
import { supabase_service } from "./supabase";
|
import { supabase_service } from "./supabase";
|
||||||
import { WebhookEventType } from "../types";
|
import { WebhookEventType } from "../types";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import { z } from "zod";
|
||||||
|
import { webhookSchema } from "../controllers/v1/types";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export const callWebhook = async (
|
export const callWebhook = async (
|
||||||
teamId: string,
|
teamId: string,
|
||||||
id: string,
|
id: string,
|
||||||
data: any | null,
|
data: any | null,
|
||||||
specified?: string,
|
specified?: z.infer<typeof webhookSchema>,
|
||||||
v1 = false,
|
v1 = false,
|
||||||
eventType: WebhookEventType = "crawl.page",
|
eventType: WebhookEventType = "crawl.page",
|
||||||
awaitWebhook: boolean = false
|
awaitWebhook: boolean = false
|
||||||
@@ -20,7 +22,7 @@ export const callWebhook = async (
|
|||||||
id
|
id
|
||||||
);
|
);
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
let webhookUrl = specified ?? selfHostedUrl;
|
let webhookUrl = specified ?? (selfHostedUrl ? webhookSchema.parse({ url: selfHostedUrl }) : undefined);
|
||||||
|
|
||||||
// Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
|
// Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
|
||||||
// and the USE_DB_AUTHENTICATION environment variable is set to true
|
// and the USE_DB_AUTHENTICATION environment variable is set to true
|
||||||
@@ -73,7 +75,7 @@ export const callWebhook = async (
|
|||||||
if (awaitWebhook) {
|
if (awaitWebhook) {
|
||||||
try {
|
try {
|
||||||
await axios.post(
|
await axios.post(
|
||||||
webhookUrl,
|
webhookUrl.url,
|
||||||
{
|
{
|
||||||
success: !v1
|
success: !v1
|
||||||
? data.success
|
? data.success
|
||||||
@@ -92,6 +94,7 @@ export const callWebhook = async (
|
|||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
|
...webhookUrl.headers,
|
||||||
},
|
},
|
||||||
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
|
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
|
||||||
}
|
}
|
||||||
@@ -104,7 +107,7 @@ export const callWebhook = async (
|
|||||||
} else {
|
} else {
|
||||||
axios
|
axios
|
||||||
.post(
|
.post(
|
||||||
webhookUrl,
|
webhookUrl.url,
|
||||||
{
|
{
|
||||||
success: !v1
|
success: !v1
|
||||||
? data.success
|
? data.success
|
||||||
@@ -123,6 +126,7 @@ export const callWebhook = async (
|
|||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
|
...webhookUrl.headers,
|
||||||
},
|
},
|
||||||
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
|
timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document } from "./controllers/v1/types";
|
import { z } from "zod";
|
||||||
|
import { AuthCreditUsageChunk, ScrapeOptions, Document as V1Document, webhookSchema } from "./controllers/v1/types";
|
||||||
import { ExtractorOptions, Document } from "./lib/entities";
|
import { ExtractorOptions, Document } from "./lib/entities";
|
||||||
import { InternalOptions } from "./scraper/scrapeURL";
|
import { InternalOptions } from "./scraper/scrapeURL";
|
||||||
|
|
||||||
@@ -33,7 +34,7 @@ export interface WebScraperOptions {
|
|||||||
origin?: string;
|
origin?: string;
|
||||||
crawl_id?: string;
|
crawl_id?: string;
|
||||||
sitemapped?: boolean;
|
sitemapped?: boolean;
|
||||||
webhook?: string;
|
webhook?: z.infer<typeof webhookSchema>;
|
||||||
v1?: boolean;
|
v1?: boolean;
|
||||||
is_scrape?: boolean;
|
is_scrape?: boolean;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@mendable/firecrawl-js",
|
"name": "@mendable/firecrawl-js",
|
||||||
"version": "1.8.1",
|
"version": "1.8.2",
|
||||||
"description": "JavaScript SDK for Firecrawl API",
|
"description": "JavaScript SDK for Firecrawl API",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
|
|||||||
@@ -153,7 +153,10 @@ export interface CrawlParams {
|
|||||||
allowExternalLinks?: boolean;
|
allowExternalLinks?: boolean;
|
||||||
ignoreSitemap?: boolean;
|
ignoreSitemap?: boolean;
|
||||||
scrapeOptions?: CrawlScrapeOptions;
|
scrapeOptions?: CrawlScrapeOptions;
|
||||||
webhook?: string;
|
webhook?: string | {
|
||||||
|
url: string;
|
||||||
|
headers?: Record<string, string>;
|
||||||
|
};
|
||||||
deduplicateSimilarURLs?: boolean;
|
deduplicateSimilarURLs?: boolean;
|
||||||
ignoreQueryParameters?: boolean;
|
ignoreQueryParameters?: boolean;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user