Nick: revert trailing comma
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
{
|
{
|
||||||
"trailingComma": "none"
|
"trailingComma": "all"
|
||||||
}
|
}
|
||||||
@@ -3,7 +3,7 @@ import dotenv from "dotenv";
|
|||||||
import {
|
import {
|
||||||
FirecrawlCrawlResponse,
|
FirecrawlCrawlResponse,
|
||||||
FirecrawlCrawlStatusResponse,
|
FirecrawlCrawlStatusResponse,
|
||||||
FirecrawlScrapeResponse
|
FirecrawlScrapeResponse,
|
||||||
} from "../../types";
|
} from "../../types";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@@ -23,9 +23,9 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
authors: { type: "array", items: { type: "string" } }
|
authors: { type: "array", items: { type: "string" } },
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -45,7 +45,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
|
|
||||||
expect(gotItRight).toBeGreaterThan(1);
|
expect(gotItRight).toBeGreaterThan(1);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -62,9 +62,9 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
founders: { type: "array", items: { type: "string" } }
|
founders: { type: "array", items: { type: "string" } },
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -83,7 +83,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
|
|
||||||
expect(gotItRight).toBeGreaterThanOrEqual(2);
|
expect(gotItRight).toBeGreaterThanOrEqual(2);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -100,10 +100,10 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
schema: {
|
schema: {
|
||||||
type: "array",
|
type: "array",
|
||||||
items: {
|
items: {
|
||||||
type: "string"
|
type: "string",
|
||||||
},
|
},
|
||||||
required: ["items"]
|
required: ["items"],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -118,7 +118,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
|
|
||||||
expect(gotItRight).toBeGreaterThan(2);
|
expect(gotItRight).toBeGreaterThan(2);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -135,15 +135,15 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
pciDssCompliance: { type: "boolean" }
|
pciDssCompliance: { type: "boolean" },
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
expect(response.body.data?.pciDssCompliance).toBe(true);
|
expect(response.body.data?.pciDssCompliance).toBe(true);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -163,10 +163,10 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
properties: {
|
properties: {
|
||||||
connector: { type: "string" },
|
connector: { type: "string" },
|
||||||
description: { type: "string" },
|
description: { type: "string" },
|
||||||
supportsCaptureDelete: { type: "boolean" }
|
supportsCaptureDelete: { type: "boolean" },
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -174,7 +174,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
// expect(response.body).toHaveProperty("data");
|
// expect(response.body).toHaveProperty("data");
|
||||||
// expect(response.body.data?.pciDssCompliance).toBe(true);
|
// expect(response.body.data?.pciDssCompliance).toBe(true);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -186,17 +186,17 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
urls: [
|
urls: [
|
||||||
"https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"
|
"https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003",
|
||||||
],
|
],
|
||||||
prompt: "what applicant tracking system is this company using?",
|
prompt: "what applicant tracking system is this company using?",
|
||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
isGreenhouseATS: { type: "boolean" },
|
isGreenhouseATS: { type: "boolean" },
|
||||||
answer: { type: "string" }
|
answer: { type: "string" },
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
allowExternalLinks: true
|
allowExternalLinks: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -204,7 +204,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
expect(response.body.data?.isGreenhouseATS).toBe(true);
|
expect(response.body.data?.isGreenhouseATS).toBe(true);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -222,12 +222,12 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
items: {
|
items: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
component: { type: "string" }
|
component: { type: "string" },
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
required: ["items"]
|
required: ["items"],
|
||||||
},
|
},
|
||||||
allowExternalLinks: true
|
allowExternalLinks: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body.data?.items);
|
console.log(response.body.data?.items);
|
||||||
@@ -248,7 +248,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
}
|
}
|
||||||
expect(gotItRight).toBeGreaterThan(2);
|
expect(gotItRight).toBeGreaterThan(2);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -267,11 +267,11 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
properties: {
|
properties: {
|
||||||
name: { type: "string" },
|
name: { type: "string" },
|
||||||
work: { type: "string" },
|
work: { type: "string" },
|
||||||
education: { type: "string" }
|
education: { type: "string" },
|
||||||
},
|
},
|
||||||
required: ["name", "work", "education"]
|
required: ["name", "work", "education"],
|
||||||
},
|
},
|
||||||
allowExternalLinks: true
|
allowExternalLinks: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body.data);
|
console.log(response.body.data);
|
||||||
@@ -281,7 +281,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
expect(response.body.data?.work).toBeDefined();
|
expect(response.body.data?.work).toBeDefined();
|
||||||
expect(response.body.data?.education).toBeDefined();
|
expect(response.body.data?.education).toBeDefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -293,7 +293,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
urls: ["https://docs.firecrawl.dev"],
|
urls: ["https://docs.firecrawl.dev"],
|
||||||
prompt: "What is the title and description of the page?"
|
prompt: "What is the title and description of the page?",
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body.data);
|
console.log(response.body.data);
|
||||||
@@ -302,6 +302,6 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
expect(typeof response.body.data).toBe("object");
|
expect(typeof response.body.data).toBe("object");
|
||||||
expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
|
expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent("should return an error for a blocklisted URL", async () => {
|
it.concurrent("should return an error for a blocklisted URL", async () => {
|
||||||
@@ -59,7 +59,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain(
|
expect(response.body.error).toContain(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -103,30 +103,30 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.description).toBe(
|
expect(response.body.data.metadata.description).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.keywords).toBe(
|
expect(response.body.data.metadata.keywords).toBe(
|
||||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
"Roast My Website,Roast,Website,GitHub,Firecrawl",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.ogDescription).toBe(
|
expect(response.body.data.metadata.ogDescription).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogUrl).toBe(
|
expect(response.body.data.metadata.ogUrl).toBe(
|
||||||
"https://www.roastmywebsite.ai"
|
"https://www.roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogImage).toBe(
|
expect(response.body.data.metadata.ogImage).toBe(
|
||||||
"https://www.roastmywebsite.ai/og.png"
|
"https://www.roastmywebsite.ai/og.png",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.sourceURL).toBe(
|
expect(response.body.data.metadata.sourceURL).toBe(
|
||||||
"https://roastmywebsite.ai"
|
"https://roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -138,7 +138,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
pageOptions: { includeHtml: true }
|
pageOptions: { includeHtml: true },
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -152,7 +152,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -164,7 +164,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
pageOptions: { includeRawHtml: true }
|
pageOptions: { includeRawHtml: true },
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -178,7 +178,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -196,12 +196,12 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("content");
|
expect(response.body.data).toHaveProperty("content");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.content).toContain(
|
expect(response.body.data.content).toContain(
|
||||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -219,12 +219,12 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("content");
|
expect(response.body.data).toHaveProperty("content");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.content).toContain(
|
expect(response.body.data.content).toContain(
|
||||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -236,7 +236,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||||
pageOptions: { parsePDF: false }
|
pageOptions: { parsePDF: false },
|
||||||
});
|
});
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
@@ -245,10 +245,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("content");
|
expect(response.body.data).toHaveProperty("content");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.content).toContain(
|
expect(response.body.data.content).toContain(
|
||||||
"/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj"
|
"/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -266,16 +266,16 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"Scrape This Site"
|
"Scrape This Site",
|
||||||
);
|
);
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"Lessons and Videos"
|
"Lessons and Videos",
|
||||||
); // #footer
|
); // #footer
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"[Sandbox]("
|
"[Sandbox](",
|
||||||
); // .nav
|
); // .nav
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"web scraping"
|
"web scraping",
|
||||||
); // strong
|
); // strong
|
||||||
|
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
@@ -284,7 +284,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
pageOptions: { removeTags: [".nav", "#footer", "strong"] }
|
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -297,7 +297,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
||||||
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
||||||
@@ -337,10 +337,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"bad request"
|
"bad request",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -359,10 +359,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"unauthorized"
|
"unauthorized",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -381,10 +381,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"forbidden"
|
"forbidden",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -403,10 +403,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"not found"
|
"not found",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -425,10 +425,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"method not allowed"
|
"method not allowed",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -447,10 +447,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"internal server error"
|
"internal server error",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -469,7 +469,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent("should return an error for a blocklisted URL", async () => {
|
it.concurrent("should return an error for a blocklisted URL", async () => {
|
||||||
@@ -481,7 +481,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain(
|
expect(response.body.error).toContain(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -496,9 +496,9 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("jobId");
|
expect(response.body).toHaveProperty("jobId");
|
||||||
expect(response.body.jobId).toMatch(
|
expect(response.body.jobId).toMatch(
|
||||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
|
||||||
);
|
);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should prevent duplicate requests using the same idempotency key",
|
"should prevent duplicate requests using the same idempotency key",
|
||||||
@@ -525,7 +525,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
|
|
||||||
expect(secondResponse.statusCode).toBe(409);
|
expect(secondResponse.statusCode).toBe(409);
|
||||||
expect(secondResponse.body.error).toBe("Idempotency key already used");
|
expect(secondResponse.body.error).toBe("Idempotency key already used");
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -539,8 +539,8 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
limit: 10,
|
limit: 10,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
includes: ["blog/*"]
|
includes: ["blog/*"],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
let response;
|
let response;
|
||||||
@@ -563,7 +563,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
const completedResponse = response;
|
const completedResponse = response;
|
||||||
|
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(5);
|
expect(urls.length).toBeGreaterThan(5);
|
||||||
urls.forEach((url: string) => {
|
urls.forEach((url: string) => {
|
||||||
@@ -579,13 +579,13 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -599,8 +599,8 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
limit: 10,
|
limit: 10,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
excludes: ["blog/*"]
|
excludes: ["blog/*"],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
let isFinished = false;
|
let isFinished = false;
|
||||||
@@ -623,14 +623,14 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
const completedResponse = response;
|
const completedResponse = response;
|
||||||
|
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(5);
|
expect(urls.length).toBeGreaterThan(5);
|
||||||
urls.forEach((url: string) => {
|
urls.forEach((url: string) => {
|
||||||
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
|
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
90000
|
90000,
|
||||||
); // 90 seconds
|
); // 90 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -642,7 +642,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
crawlerOptions: { limit: 3 }
|
crawlerOptions: { limit: 3 },
|
||||||
});
|
});
|
||||||
|
|
||||||
let isFinished = false;
|
let isFinished = false;
|
||||||
@@ -674,13 +674,13 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -692,7 +692,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com",
|
url: "https://www.scrapethissite.com",
|
||||||
crawlerOptions: { maxDepth: 1 }
|
crawlerOptions: { maxDepth: 1 },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -726,13 +726,13 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(1);
|
expect(urls.length).toBeGreaterThan(1);
|
||||||
|
|
||||||
@@ -748,7 +748,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(depth).toBeLessThanOrEqual(2);
|
expect(depth).toBeLessThanOrEqual(2);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -760,7 +760,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com/pages/",
|
url: "https://www.scrapethissite.com/pages/",
|
||||||
crawlerOptions: { maxDepth: 1 }
|
crawlerOptions: { maxDepth: 1 },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -794,7 +794,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(1);
|
expect(urls.length).toBeGreaterThan(1);
|
||||||
|
|
||||||
@@ -810,7 +810,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(depth).toBeLessThanOrEqual(3);
|
expect(depth).toBeLessThanOrEqual(3);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -822,7 +822,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.mendable.ai",
|
url: "https://www.mendable.ai",
|
||||||
crawlerOptions: { maxDepth: 0 }
|
crawlerOptions: { maxDepth: 0 },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -849,7 +849,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
const testurls = completedResponse.body.data.map(
|
const testurls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
//console.log(testurls)
|
//console.log(testurls)
|
||||||
|
|
||||||
@@ -861,7 +861,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThanOrEqual(1);
|
expect(urls.length).toBeGreaterThanOrEqual(1);
|
||||||
|
|
||||||
@@ -877,7 +877,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(depth).toBeLessThanOrEqual(1);
|
expect(depth).toBeLessThanOrEqual(1);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
|
// it.concurrent("should return a successful response with a valid API key and valid limit option", async () => {
|
||||||
@@ -934,7 +934,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
pageOptions: { includeHtml: true }
|
pageOptions: { includeHtml: true },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -969,10 +969,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
|
|
||||||
// 120 seconds
|
// 120 seconds
|
||||||
@@ -983,13 +983,13 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||||
|
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1005,8 +1005,8 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
allowExternalContentLinks: true,
|
allowExternalContentLinks: true,
|
||||||
ignoreSitemap: true,
|
ignoreSitemap: true,
|
||||||
returnOnlyUrls: true,
|
returnOnlyUrls: true,
|
||||||
limit: 50
|
limit: 50,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(crawlInitResponse.statusCode).toBe(200);
|
expect(crawlInitResponse.statusCode).toBe(200);
|
||||||
@@ -1031,19 +1031,19 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect.arrayContaining([
|
expect.arrayContaining([
|
||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
url: expect.stringContaining(
|
url: expect.stringContaining(
|
||||||
"https://firecrawl.dev/?ref=mendable+banner"
|
"https://firecrawl.dev/?ref=mendable+banner",
|
||||||
)
|
),
|
||||||
}),
|
}),
|
||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
url: expect.stringContaining("https://mendable.ai/pricing")
|
url: expect.stringContaining("https://mendable.ai/pricing"),
|
||||||
}),
|
}),
|
||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
url: expect.stringContaining("https://x.com/CalebPeffer")
|
url: expect.stringContaining("https://x.com/CalebPeffer"),
|
||||||
})
|
}),
|
||||||
])
|
]),
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 3 minutes timeout
|
); // 3 minutes timeout
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1062,7 +1062,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
// it.concurrent("should return an error for a blocklisted URL", async () => {
|
// it.concurrent("should return an error for a blocklisted URL", async () => {
|
||||||
@@ -1088,7 +1088,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
|
|
||||||
expect(response.statusCode).toBe(408);
|
expect(response.statusCode).toBe(408);
|
||||||
},
|
},
|
||||||
3000
|
3000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
|
// it.concurrent("should return a successful response with a valid API key for crawlWebsitePreview", async () => {
|
||||||
@@ -1120,7 +1120,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ query: "test" });
|
.send({ query: "test" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1136,7 +1136,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(response.body.success).toBe(true);
|
expect(response.body.success).toBe(true);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1153,7 +1153,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.get("/v0/crawl/status/123")
|
.get("/v0/crawl/status/123")
|
||||||
.set("Authorization", `Bearer invalid-api-key`);
|
.set("Authorization", `Bearer invalid-api-key`);
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1163,7 +1163,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.get("/v0/crawl/status/invalidJobId")
|
.get("/v0/crawl/status/invalidJobId")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
expect(response.statusCode).toBe(404);
|
expect(response.statusCode).toBe(404);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1201,22 +1201,22 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
|
|
||||||
const childrenLinks = completedResponse.body.data.filter(
|
const childrenLinks = completedResponse.body.data.filter(
|
||||||
(doc) =>
|
(doc) =>
|
||||||
doc.metadata &&
|
doc.metadata &&
|
||||||
doc.metadata.sourceURL &&
|
doc.metadata.sourceURL &&
|
||||||
doc.metadata.sourceURL.includes("mendable.ai/blog")
|
doc.metadata.sourceURL.includes("mendable.ai/blog"),
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 120 seconds
|
); // 120 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1236,9 +1236,9 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
"abs/*",
|
"abs/*",
|
||||||
"static/*",
|
"static/*",
|
||||||
"about/*",
|
"about/*",
|
||||||
"archive/*"
|
"archive/*",
|
||||||
]
|
],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -1266,21 +1266,21 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect.arrayContaining([
|
expect.arrayContaining([
|
||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
content: expect.stringContaining(
|
content: expect.stringContaining(
|
||||||
"asymmetries might represent, for instance, preferred source orientations to our line of sight."
|
"asymmetries might represent, for instance, preferred source orientations to our line of sight.",
|
||||||
)
|
),
|
||||||
})
|
}),
|
||||||
])
|
]),
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 120 seconds
|
); // 120 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1292,7 +1292,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
pageOptions: { includeHtml: true }
|
pageOptions: { includeHtml: true },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -1333,13 +1333,13 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
|
expect(completedResponse.body.data[0].markdown).toContain("_Roast_");
|
||||||
expect(completedResponse.body.data[0].html).toContain("<h1");
|
expect(completedResponse.body.data[0].html).toContain("<h1");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
}); // 60 seconds
|
}); // 60 seconds
|
||||||
|
|
||||||
@@ -1353,7 +1353,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://mendable.ai/blog",
|
url: "https://mendable.ai/blog",
|
||||||
pageOptions: { includeHtml: true },
|
pageOptions: { includeHtml: true },
|
||||||
crawlerOptions: { allowBackwardCrawling: true }
|
crawlerOptions: { allowBackwardCrawling: true },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -1397,10 +1397,10 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
expect(completedResponse.body.data.length).toBeGreaterThan(
|
expect(completedResponse.body.data.length).toBeGreaterThan(
|
||||||
onlyChildrenLinks.length
|
onlyChildrenLinks.length,
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1438,13 +1438,13 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("metadata");
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.partial_data[0].metadata.pageStatusCode
|
completedResponse.body.partial_data[0].metadata.pageStatusCode,
|
||||||
).toBe(200);
|
).toBe(200);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.partial_data[0].metadata.pageError
|
completedResponse.body.partial_data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
describe("POST /v0/scrape with LLM Extraction", () => {
|
describe("POST /v0/scrape with LLM Extraction", () => {
|
||||||
@@ -1458,7 +1458,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
onlyMainContent: true
|
onlyMainContent: true,
|
||||||
},
|
},
|
||||||
extractorOptions: {
|
extractorOptions: {
|
||||||
mode: "llm-extraction",
|
mode: "llm-extraction",
|
||||||
@@ -1468,18 +1468,18 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
company_mission: {
|
company_mission: {
|
||||||
type: "string"
|
type: "string",
|
||||||
},
|
},
|
||||||
supports_sso: {
|
supports_sso: {
|
||||||
type: "boolean"
|
type: "boolean",
|
||||||
},
|
},
|
||||||
is_open_source: {
|
is_open_source: {
|
||||||
type: "boolean"
|
type: "boolean",
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"]
|
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Ensure that the job was successfully created before proceeding with LLM extraction
|
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||||
@@ -1498,7 +1498,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(llmExtraction.is_open_source).toBe(false);
|
expect(llmExtraction.is_open_source).toBe(false);
|
||||||
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 secs
|
); // 60 secs
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1519,15 +1519,15 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
primary_cta: {
|
primary_cta: {
|
||||||
type: "string"
|
type: "string",
|
||||||
},
|
},
|
||||||
secondary_cta: {
|
secondary_cta: {
|
||||||
type: "string"
|
type: "string",
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
required: ["primary_cta", "secondary_cta"]
|
required: ["primary_cta", "secondary_cta"],
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Ensure that the job was successfully created before proceeding with LLM extraction
|
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||||
@@ -1542,7 +1542,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(llmExtraction).toHaveProperty("secondary_cta");
|
expect(llmExtraction).toHaveProperty("secondary_cta");
|
||||||
expect(typeof llmExtraction.secondary_cta).toBe("string");
|
expect(typeof llmExtraction.secondary_cta).toBe("string");
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 secs
|
); // 60 secs
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1617,8 +1617,8 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://flutterbricks.com",
|
url: "https://flutterbricks.com",
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
mode: "fast"
|
mode: "fast",
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
@@ -1660,7 +1660,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
expect(results.length).toBeGreaterThanOrEqual(10);
|
expect(results.length).toBeGreaterThanOrEqual(10);
|
||||||
expect(results.length).toBeLessThanOrEqual(15);
|
expect(results.length).toBeLessThanOrEqual(15);
|
||||||
},
|
},
|
||||||
20000
|
20000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// it.concurrent("should complete the crawl in more than 10 seconds", async () => {
|
// it.concurrent("should complete the crawl in more than 10 seconds", async () => {
|
||||||
@@ -1741,7 +1741,7 @@ describe("E2E Tests for API Routes", () => {
|
|||||||
|
|
||||||
expect(response.statusCode).toBe(429);
|
expect(response.statusCode).toBe(429);
|
||||||
},
|
},
|
||||||
90000
|
90000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: false,
|
sitemapOnly: false,
|
||||||
search: "smart-crawl"
|
search: "smart-crawl",
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -24,7 +24,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
expect(response.body.links.length).toBeGreaterThan(0);
|
expect(response.body.links.length).toBeGreaterThan(0);
|
||||||
expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
|
expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -37,7 +37,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: false,
|
sitemapOnly: false,
|
||||||
includeSubdomains: true
|
includeSubdomains: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -45,10 +45,10 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
expect(response.body).toHaveProperty("links");
|
expect(response.body).toHaveProperty("links");
|
||||||
expect(response.body.links.length).toBeGreaterThan(0);
|
expect(response.body.links.length).toBeGreaterThan(0);
|
||||||
expect(response.body.links[response.body.links.length - 1]).toContain(
|
expect(response.body.links[response.body.links.length - 1]).toContain(
|
||||||
"docs.firecrawl.dev"
|
"docs.firecrawl.dev",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -60,7 +60,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: true
|
sitemapOnly: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -68,10 +68,10 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
expect(response.body).toHaveProperty("links");
|
expect(response.body).toHaveProperty("links");
|
||||||
expect(response.body.links.length).toBeGreaterThan(0);
|
expect(response.body.links.length).toBeGreaterThan(0);
|
||||||
expect(response.body.links[response.body.links.length - 1]).not.toContain(
|
expect(response.body.links[response.body.links.length - 1]).not.toContain(
|
||||||
"docs.firecrawl.dev"
|
"docs.firecrawl.dev",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -84,7 +84,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: false,
|
sitemapOnly: false,
|
||||||
limit: 10
|
limit: 10,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -92,7 +92,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
expect(response.body).toHaveProperty("links");
|
expect(response.body).toHaveProperty("links");
|
||||||
expect(response.body.links.length).toBeLessThanOrEqual(10);
|
expect(response.body.links.length).toBeLessThanOrEqual(10);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -104,7 +104,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://geekflare.com/sitemap_index.xml",
|
url: "https://geekflare.com/sitemap_index.xml",
|
||||||
sitemapOnly: true
|
sitemapOnly: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -112,6 +112,6 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
expect(response.body).toHaveProperty("links");
|
expect(response.body).toHaveProperty("links");
|
||||||
expect(response.body.links.length).toBeGreaterThan(1900);
|
expect(response.body.links.length).toBeGreaterThan(1900);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain(
|
expect(response.body.error).toContain(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -89,7 +89,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain(
|
expect(response.body.error).toContain(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -101,7 +101,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("jobId");
|
expect(response.body).toHaveProperty("jobId");
|
||||||
expect(response.body.jobId).toMatch(
|
expect(response.body.jobId).toMatch(
|
||||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -120,7 +120,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain(
|
expect(response.body.error).toContain(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -132,7 +132,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("jobId");
|
expect(response.body).toHaveProperty("jobId");
|
||||||
expect(response.body.jobId).toMatch(
|
expect(response.body.jobId).toMatch(
|
||||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -172,7 +172,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
|
|
||||||
it("should return Job not found for invalid job ID", async () => {
|
it("should return Job not found for invalid job ID", async () => {
|
||||||
const response = await request(TEST_URL).get(
|
const response = await request(TEST_URL).get(
|
||||||
"/v0/crawl/status/invalidJobId"
|
"/v0/crawl/status/invalidJobId",
|
||||||
);
|
);
|
||||||
expect(response.statusCode).toBe(404);
|
expect(response.statusCode).toBe(404);
|
||||||
});
|
});
|
||||||
@@ -185,7 +185,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
const response = await request(TEST_URL).get(
|
const response = await request(TEST_URL).get(
|
||||||
`/v0/crawl/status/${crawlResponse.body.jobId}`
|
`/v0/crawl/status/${crawlResponse.body.jobId}`,
|
||||||
);
|
);
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("status");
|
expect(response.body).toHaveProperty("status");
|
||||||
@@ -195,7 +195,7 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
await new Promise((r) => setTimeout(r, 30000));
|
await new Promise((r) => setTimeout(r, 30000));
|
||||||
|
|
||||||
const completedResponse = await request(TEST_URL).get(
|
const completedResponse = await request(TEST_URL).get(
|
||||||
`/v0/crawl/status/${crawlResponse.body.jobId}`
|
`/v0/crawl/status/${crawlResponse.body.jobId}`,
|
||||||
);
|
);
|
||||||
expect(completedResponse.statusCode).toBe(200);
|
expect(completedResponse.statusCode).toBe(200);
|
||||||
expect(completedResponse.body).toHaveProperty("status");
|
expect(completedResponse.body).toHaveProperty("status");
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import request from "supertest";
|
|||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import {
|
import {
|
||||||
ScrapeRequestInput,
|
ScrapeRequestInput,
|
||||||
ScrapeResponseRequestTest
|
ScrapeResponseRequestTest,
|
||||||
} from "../../controllers/v1/types";
|
} from "../../controllers/v1/types";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
@@ -24,7 +24,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
"process.env.USE_DB_AUTHENTICATION",
|
"process.env.USE_DB_AUTHENTICATION",
|
||||||
process.env.USE_DB_AUTHENTICATION
|
process.env.USE_DB_AUTHENTICATION,
|
||||||
);
|
);
|
||||||
console.log("?", process.env.USE_DB_AUTHENTICATION === "true");
|
console.log("?", process.env.USE_DB_AUTHENTICATION === "true");
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
@@ -47,7 +47,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://facebook.com/fake-test"
|
url: "https://facebook.com/fake-test",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
@@ -58,7 +58,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toBe(
|
expect(response.body.error).toBe(
|
||||||
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
|
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -71,14 +71,14 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai"
|
url: "https://roastmywebsite.ai",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -100,37 +100,37 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.description).toBe(
|
expect(response.body.data.metadata.description).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.keywords).toBe(
|
expect(response.body.data.metadata.keywords).toBe(
|
||||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
"Roast My Website,Roast,Website,GitHub,Firecrawl",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.ogDescription).toBe(
|
expect(response.body.data.metadata.ogDescription).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogUrl).toBe(
|
expect(response.body.data.metadata.ogUrl).toBe(
|
||||||
"https://www.roastmywebsite.ai"
|
"https://www.roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogImage).toBe(
|
expect(response.body.data.metadata.ogImage).toBe(
|
||||||
"https://www.roastmywebsite.ai/og.png"
|
"https://www.roastmywebsite.ai/og.png",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.sourceURL).toBe(
|
expect(response.body.data.metadata.sourceURL).toBe(
|
||||||
"https://roastmywebsite.ai"
|
"https://roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/abs/2410.04840"
|
url: "https://arxiv.org/abs/2410.04840",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -151,43 +151,43 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.markdown).toContain("Strong Model Collapse");
|
expect(response.body.data.markdown).toContain("Strong Model Collapse");
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
expect(response.body.data.metadata.description).toContain(
|
expect(response.body.data.metadata.description).toContain(
|
||||||
"Abstract page for arXiv paper 2410.04840: Strong Model Collapse"
|
"Abstract page for arXiv paper 2410.04840: Strong Model Collapse",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.citation_title).toBe(
|
expect(response.body.data.metadata.citation_title).toBe(
|
||||||
"Strong Model Collapse"
|
"Strong Model Collapse",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.citation_author).toEqual([
|
expect(response.body.data.metadata.citation_author).toEqual([
|
||||||
"Dohmatob, Elvis",
|
"Dohmatob, Elvis",
|
||||||
"Feng, Yunzhen",
|
"Feng, Yunzhen",
|
||||||
"Subramonian, Arjun",
|
"Subramonian, Arjun",
|
||||||
"Kempe, Julia"
|
"Kempe, Julia",
|
||||||
]);
|
]);
|
||||||
expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
|
expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
|
||||||
expect(response.body.data.metadata.citation_online_date).toBe(
|
expect(response.body.data.metadata.citation_online_date).toBe(
|
||||||
"2024/10/08"
|
"2024/10/08",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.citation_pdf_url).toBe(
|
expect(response.body.data.metadata.citation_pdf_url).toBe(
|
||||||
"http://arxiv.org/pdf/2410.04840"
|
"http://arxiv.org/pdf/2410.04840",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.citation_arxiv_id).toBe(
|
expect(response.body.data.metadata.citation_arxiv_id).toBe(
|
||||||
"2410.04840"
|
"2410.04840",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.citation_abstract).toContain(
|
expect(response.body.data.metadata.citation_abstract).toContain(
|
||||||
"Within the scaling laws paradigm"
|
"Within the scaling laws paradigm",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.sourceURL).toBe(
|
expect(response.body.data.metadata.sourceURL).toBe(
|
||||||
"https://arxiv.org/abs/2410.04840"
|
"https://arxiv.org/abs/2410.04840",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key and includeHtml set to true",
|
"should return a successful response with a valid API key and includeHtml set to true",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["markdown", "html"]
|
formats: ["markdown", "html"],
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -209,13 +209,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response for a valid scrape with PDF file",
|
"should return a successful response for a valid scrape with PDF file",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||||
// formats: ["markdown", "html"],
|
// formats: ["markdown", "html"],
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -232,19 +232,19 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Broad Line Radio Galaxy"
|
"Broad Line Radio Galaxy",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
|
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
url: "https://arxiv.org/pdf/astro-ph/9301001",
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
@@ -261,12 +261,12 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("markdown");
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Broad Line Radio Galaxy"
|
"Broad Line Radio Galaxy",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -274,7 +274,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
onlyMainContent: false // default is true
|
onlyMainContent: false, // default is true
|
||||||
};
|
};
|
||||||
const responseWithoutRemoveTags: ScrapeResponseRequestTest =
|
const responseWithoutRemoveTags: ScrapeResponseRequestTest =
|
||||||
await request(TEST_URL)
|
await request(TEST_URL)
|
||||||
@@ -292,16 +292,16 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
|
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
|
||||||
"[FAQ](/faq/)"
|
"[FAQ](/faq/)",
|
||||||
); // .nav
|
); // .nav
|
||||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
|
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
|
||||||
"Hartley Brody 2023"
|
"Hartley Brody 2023",
|
||||||
); // #footer
|
); // #footer
|
||||||
|
|
||||||
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
|
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
excludeTags: [".nav", "#footer", "strong"],
|
excludeTags: [".nav", "#footer", "strong"],
|
||||||
onlyMainContent: false // default is true
|
onlyMainContent: false, // default is true
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
@@ -320,7 +320,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
||||||
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -342,7 +342,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(400);
|
expect(response.body.data.metadata.statusCode).toBe(400);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -364,7 +364,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(401);
|
expect(response.body.data.metadata.statusCode).toBe(401);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Removed it as we want to retry fallback to the next scraper
|
// Removed it as we want to retry fallback to the next scraper
|
||||||
@@ -405,7 +405,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(404);
|
expect(response.body.data.metadata.statusCode).toBe(404);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
// it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||||
@@ -455,7 +455,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
expect(response.statusCode).toBe(408);
|
expect(response.statusCode).toBe(408);
|
||||||
},
|
},
|
||||||
3000
|
3000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -463,7 +463,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["html", "rawHtml"]
|
formats: ["html", "rawHtml"],
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -486,7 +486,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -495,7 +495,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://ycombinator.com/companies",
|
url: "https://ycombinator.com/companies",
|
||||||
formats: ["markdown"],
|
formats: ["markdown"],
|
||||||
waitFor: 8000
|
waitFor: 8000,
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -518,7 +518,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -526,7 +526,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["links"]
|
formats: ["links"],
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -548,7 +548,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -569,14 +569,14 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://roastmywebsite.ai"
|
url: "https://roastmywebsite.ai",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -594,7 +594,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const links = response.body.links as unknown[];
|
const links = response.body.links as unknown[];
|
||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -602,7 +602,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://usemotion.com",
|
url: "https://usemotion.com",
|
||||||
search: "pricing"
|
search: "pricing",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -621,7 +621,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
expect(links[0]).toContain("usemotion.com/pricing");
|
expect(links[0]).toContain("usemotion.com/pricing");
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -630,7 +630,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
search: "docs",
|
search: "docs",
|
||||||
includeSubdomains: true
|
includeSubdomains: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -650,10 +650,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
|
|
||||||
const containsDocsFirecrawlDev = links.some((link: string) =>
|
const containsDocsFirecrawlDev = links.some((link: string) =>
|
||||||
link.includes("docs.firecrawl.dev")
|
link.includes("docs.firecrawl.dev"),
|
||||||
);
|
);
|
||||||
expect(containsDocsFirecrawlDev).toBe(true);
|
expect(containsDocsFirecrawlDev).toBe(true);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -662,7 +662,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://www.firecrawl.dev",
|
url: "https://www.firecrawl.dev",
|
||||||
search: "docs",
|
search: "docs",
|
||||||
includeSubdomains: true
|
includeSubdomains: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -682,11 +682,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
|
|
||||||
const containsDocsFirecrawlDev = links.some((link: string) =>
|
const containsDocsFirecrawlDev = links.some((link: string) =>
|
||||||
link.includes("docs.firecrawl.dev")
|
link.includes("docs.firecrawl.dev"),
|
||||||
);
|
);
|
||||||
expect(containsDocsFirecrawlDev).toBe(true);
|
expect(containsDocsFirecrawlDev).toBe(true);
|
||||||
},
|
},
|
||||||
10000
|
10000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -695,7 +695,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://www.firecrawl.dev",
|
url: "https://www.firecrawl.dev",
|
||||||
search: "docs",
|
search: "docs",
|
||||||
includeSubdomains: false
|
includeSubdomains: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -714,14 +714,14 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent("should return an error for invalid URL", async () => {
|
it.concurrent("should return an error for invalid URL", async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "invalid-url",
|
url: "invalid-url",
|
||||||
includeSubdomains: true,
|
includeSubdomains: true,
|
||||||
search: "test"
|
search: "test",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -746,7 +746,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://facebook.com/fake-test"
|
url: "https://facebook.com/fake-test",
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
@@ -757,7 +757,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toBe(
|
expect(response.body.error).toBe(
|
||||||
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
|
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -770,7 +770,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response", async () => {
|
it.concurrent("should return a successful response", async () => {
|
||||||
@@ -783,7 +783,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("id");
|
expect(response.body).toHaveProperty("id");
|
||||||
expect(response.body.id).toMatch(
|
expect(response.body.id).toMatch(
|
||||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
|
||||||
);
|
);
|
||||||
expect(response.body).toHaveProperty("success", true);
|
expect(response.body).toHaveProperty("success", true);
|
||||||
expect(response.body).toHaveProperty("url");
|
expect(response.body).toHaveProperty("url");
|
||||||
@@ -800,7 +800,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
limit: 40,
|
limit: 40,
|
||||||
includePaths: ["blog/*"]
|
includePaths: ["blog/*"],
|
||||||
});
|
});
|
||||||
|
|
||||||
let response;
|
let response;
|
||||||
@@ -826,7 +826,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(5);
|
expect(urls.length).toBeGreaterThan(5);
|
||||||
urls.forEach((url: string) => {
|
urls.forEach((url: string) => {
|
||||||
@@ -843,7 +843,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 180 seconds
|
); // 180 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -856,7 +856,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
limit: 40,
|
limit: 40,
|
||||||
excludePaths: ["blog/*"]
|
excludePaths: ["blog/*"],
|
||||||
});
|
});
|
||||||
|
|
||||||
let isFinished = false;
|
let isFinished = false;
|
||||||
@@ -882,14 +882,14 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(3);
|
expect(urls.length).toBeGreaterThan(3);
|
||||||
urls.forEach((url: string) => {
|
urls.forEach((url: string) => {
|
||||||
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
|
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
90000
|
90000,
|
||||||
); // 90 seconds
|
); // 90 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -901,7 +901,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com",
|
url: "https://www.scrapethissite.com",
|
||||||
maxDepth: 1
|
maxDepth: 1,
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -911,7 +911,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("status");
|
expect(response.body).toHaveProperty("status");
|
||||||
expect(["active", "waiting", "completed", "scraping"]).toContain(
|
expect(["active", "waiting", "completed", "scraping"]).toContain(
|
||||||
response.body.status
|
response.body.status,
|
||||||
);
|
);
|
||||||
// wait for 60 seconds
|
// wait for 60 seconds
|
||||||
let isCompleted = false;
|
let isCompleted = false;
|
||||||
@@ -939,7 +939,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(1);
|
expect(urls.length).toBeGreaterThan(1);
|
||||||
|
|
||||||
@@ -955,7 +955,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(depth).toBeLessThanOrEqual(2);
|
expect(depth).toBeLessThanOrEqual(2);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -972,7 +972,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.get("/v1/crawl/123")
|
.get("/v1/crawl/123")
|
||||||
.set("Authorization", `Bearer invalid-api-key`);
|
.set("Authorization", `Bearer invalid-api-key`);
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -982,7 +982,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.get("/v1/crawl/invalidJobId")
|
.get("/v1/crawl/invalidJobId")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
expect(response.statusCode).toBe(404);
|
expect(response.statusCode).toBe(404);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1026,12 +1026,12 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||||
|
|
||||||
const childrenLinks = completedResponse.body.data.filter(
|
const childrenLinks = completedResponse.body.data.filter(
|
||||||
(doc) => doc.metadata && doc.metadata.sourceURL
|
(doc) => doc.metadata && doc.metadata.sourceURL,
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 120 seconds
|
); // 120 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -1068,7 +1068,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import request from "supertest";
|
|||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import {
|
import {
|
||||||
ScrapeRequest,
|
ScrapeRequest,
|
||||||
ScrapeResponseRequestTest
|
ScrapeResponseRequestTest,
|
||||||
} from "../../controllers/v1/types";
|
} from "../../controllers/v1/types";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
@@ -14,7 +14,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
"should return a successful response for a scrape with 403 page",
|
"should return a successful response for a scrape with 403 page",
|
||||||
async () => {
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -30,18 +30,18 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(403);
|
expect(response.body.data.metadata.statusCode).toBe(403);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should handle 'formats:markdown (default)' parameter correctly",
|
"should handle 'formats:markdown (default)' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL
|
url: E2E_TEST_SERVER_URL,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -57,26 +57,26 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("markdown");
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"This page is used for end-to-end (e2e) testing with Firecrawl."
|
"This page is used for end-to-end (e2e) testing with Firecrawl.",
|
||||||
);
|
);
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Content with id #content-1"
|
"Content with id #content-1",
|
||||||
);
|
);
|
||||||
// expect(response.body.data.markdown).toContain("Loading...");
|
// expect(response.body.data.markdown).toContain("Loading...");
|
||||||
expect(response.body.data.markdown).toContain("Click me!");
|
expect(response.body.data.markdown).toContain("Click me!");
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Power your AI apps with clean data crawled from any website. It's also open-source."
|
"Power your AI apps with clean data crawled from any website. It's also open-source.",
|
||||||
); // firecrawl.dev inside an iframe
|
); // firecrawl.dev inside an iframe
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"This content loads only when you see it. Don't blink! 👼"
|
"This content loads only when you see it. Don't blink! 👼",
|
||||||
); // the browser always scroll to the bottom
|
); // the browser always scroll to the bottom
|
||||||
expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
|
expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
|
||||||
expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
|
expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
|
||||||
expect(response.body.data.markdown).not.toContain(
|
expect(response.body.data.markdown).not.toContain(
|
||||||
"This content is only visible on mobile"
|
"This content is only visible on mobile",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -84,11 +84,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
formats: ["html"]
|
formats: ["html"],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -105,13 +105,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("html");
|
expect(response.body.data).toHaveProperty("html");
|
||||||
|
|
||||||
expect(response.body.data.html).not.toContain(
|
expect(response.body.data.html).not.toContain(
|
||||||
'<header class="row-start-1" style="">Header</header>'
|
'<header class="row-start-1" style="">Header</header>',
|
||||||
);
|
);
|
||||||
expect(response.body.data.html).toContain(
|
expect(response.body.data.html).toContain(
|
||||||
'<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>'
|
'<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>',
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -119,11 +119,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
formats: ["rawHtml"]
|
formats: ["rawHtml"],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -140,11 +140,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("rawHtml");
|
expect(response.body.data).toHaveProperty("rawHtml");
|
||||||
|
|
||||||
expect(response.body.data.rawHtml).toContain(
|
expect(response.body.data.rawHtml).toContain(
|
||||||
">This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
|
">This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
|
||||||
);
|
);
|
||||||
expect(response.body.data.rawHtml).toContain(">Header</header>");
|
expect(response.body.data.rawHtml).toContain(">Header</header>");
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// - TODO: tests for links
|
// - TODO: tests for links
|
||||||
@@ -157,11 +157,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// @ts-ignore
|
// @ts-ignore
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
headers: { "e2e-header-test": "firecrawl" }
|
headers: { "e2e-header-test": "firecrawl" },
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -175,10 +175,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"e2e-header-test: firecrawl"
|
"e2e-header-test: firecrawl",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -186,11 +186,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
includeTags: ["#content-1"]
|
includeTags: ["#content-1"],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -204,13 +204,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).not.toContain(
|
expect(response.body.data.markdown).not.toContain(
|
||||||
"<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
|
"<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
|
||||||
);
|
);
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Content with id #content-1"
|
"Content with id #content-1",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -218,11 +218,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
excludeTags: ["#content-1"]
|
excludeTags: ["#content-1"],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -236,13 +236,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"This page is used for end-to-end (e2e) testing with Firecrawl."
|
"This page is used for end-to-end (e2e) testing with Firecrawl.",
|
||||||
);
|
);
|
||||||
expect(response.body.data.markdown).not.toContain(
|
expect(response.body.data.markdown).not.toContain(
|
||||||
"Content with id #content-1"
|
"Content with id #content-1",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -251,11 +251,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
formats: ["html", "markdown"],
|
formats: ["html", "markdown"],
|
||||||
onlyMainContent: false
|
onlyMainContent: false,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -269,13 +269,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"This page is used for end-to-end (e2e) testing with Firecrawl."
|
"This page is used for end-to-end (e2e) testing with Firecrawl.",
|
||||||
);
|
);
|
||||||
expect(response.body.data.html).toContain(
|
expect(response.body.data.html).toContain(
|
||||||
'<header class="row-start-1" style="">Header</header>'
|
'<header class="row-start-1" style="">Header</header>',
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -283,11 +283,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
timeout: 500
|
timeout: 500,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -302,7 +302,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.error).toBe("Request timed out");
|
expect(response.body.error).toBe("Request timed out");
|
||||||
expect(response.body.success).toBe(false);
|
expect(response.body.success).toBe(false);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -310,11 +310,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
mobile: true
|
mobile: true,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -327,17 +327,17 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"This content is only visible on mobile"
|
"This content is only visible on mobile",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should handle 'parsePDF' parameter correctly",
|
"should handle 'parsePDF' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -352,21 +352,21 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"arXiv:astro-ph/9301001v1 7 Jan 1993"
|
"arXiv:astro-ph/9301001v1 7 Jan 1993",
|
||||||
);
|
);
|
||||||
expect(response.body.data.markdown).not.toContain(
|
expect(response.body.data.markdown).not.toContain(
|
||||||
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm"
|
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
|
||||||
);
|
);
|
||||||
|
|
||||||
const responseNoParsePDF: ScrapeResponseRequestTest = await request(
|
const responseNoParsePDF: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||||
parsePDF: false
|
parsePDF: false,
|
||||||
});
|
});
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
@@ -376,10 +376,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(responseNoParsePDF.body.data.markdown).toContain(
|
expect(responseNoParsePDF.body.data.markdown).toContain(
|
||||||
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm"
|
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// it.concurrent("should handle 'location' parameter correctly",
|
// it.concurrent("should handle 'location' parameter correctly",
|
||||||
@@ -408,11 +408,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: "https://expired.badssl.com/",
|
url: "https://expired.badssl.com/",
|
||||||
timeout: 120000
|
timeout: 120000,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -430,7 +430,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
const scrapeRequestWithSkipTlsVerification = {
|
const scrapeRequestWithSkipTlsVerification = {
|
||||||
url: "https://expired.badssl.com/",
|
url: "https://expired.badssl.com/",
|
||||||
skipTlsVerification: true,
|
skipTlsVerification: true,
|
||||||
timeout: 120000
|
timeout: 120000,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const responseWithSkipTlsVerification: ScrapeResponseRequestTest =
|
const responseWithSkipTlsVerification: ScrapeResponseRequestTest =
|
||||||
@@ -448,10 +448,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
// console.log(responseWithSkipTlsVerification.body.data)
|
// console.log(responseWithSkipTlsVerification.body.data)
|
||||||
expect(responseWithSkipTlsVerification.body.data.markdown).toContain(
|
expect(responseWithSkipTlsVerification.body.data.markdown).toContain(
|
||||||
"badssl.com"
|
"badssl.com",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -459,11 +459,11 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
removeBase64Images: true
|
removeBase64Images: true,
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -478,7 +478,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// - TODO: not working for every image
|
// - TODO: not working for every image
|
||||||
// expect(response.body.data.markdown).toContain("Image-Removed");
|
// expect(response.body.data.markdown).toContain("Image-Removed");
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -489,13 +489,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "wait",
|
type: "wait",
|
||||||
milliseconds: 10000
|
milliseconds: 10000,
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -508,10 +508,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
expect(response.body.data.markdown).not.toContain("Loading...");
|
expect(response.body.data.markdown).not.toContain("Loading...");
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Content loaded after 5 seconds!"
|
"Content loaded after 5 seconds!",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// screenshot
|
// screenshot
|
||||||
@@ -522,13 +522,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "screenshot"
|
type: "screenshot",
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -543,15 +543,15 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have screenshots array");
|
throw new Error("Expected response body to have screenshots array");
|
||||||
}
|
}
|
||||||
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
|
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
expect(response.body.data.actions.screenshots[0]).toContain(
|
expect(response.body.data.actions.screenshots[0]).toContain(
|
||||||
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"
|
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO compare screenshot with expected screenshot
|
// TODO compare screenshot with expected screenshot
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -562,16 +562,16 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "screenshot",
|
type: "screenshot",
|
||||||
fullPage: true
|
fullPage: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "scrape"
|
type: "scrape",
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -587,24 +587,24 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have screenshots array");
|
throw new Error("Expected response body to have screenshots array");
|
||||||
}
|
}
|
||||||
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
|
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
expect(response.body.data.actions.screenshots[0]).toContain(
|
expect(response.body.data.actions.screenshots[0]).toContain(
|
||||||
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"
|
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-",
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!response.body.data.actions?.scrapes) {
|
if (!response.body.data.actions?.scrapes) {
|
||||||
throw new Error("Expected response body to have scrapes array");
|
throw new Error("Expected response body to have scrapes array");
|
||||||
}
|
}
|
||||||
expect(response.body.data.actions.scrapes[0].url).toBe(
|
expect(response.body.data.actions.scrapes[0].url).toBe(
|
||||||
"https://firecrawl-e2e-test.vercel.app/"
|
"https://firecrawl-e2e-test.vercel.app/",
|
||||||
);
|
);
|
||||||
expect(response.body.data.actions.scrapes[0].html).toContain(
|
expect(response.body.data.actions.scrapes[0].html).toContain(
|
||||||
"This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
|
"This page is used for end-to-end (e2e) testing with Firecrawl.</p>",
|
||||||
);
|
);
|
||||||
// TODO compare screenshot with expected full page screenshot
|
// TODO compare screenshot with expected full page screenshot
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -615,13 +615,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "click",
|
type: "click",
|
||||||
selector: "#click-me"
|
selector: "#click-me",
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -634,10 +634,10 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
expect(response.body.data.markdown).not.toContain("Click me!");
|
expect(response.body.data.markdown).not.toContain("Click me!");
|
||||||
expect(response.body.data.markdown).toContain(
|
expect(response.body.data.markdown).toContain(
|
||||||
"Text changed after click!"
|
"Text changed after click!",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -649,17 +649,17 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "click",
|
type: "click",
|
||||||
selector: "#input-1"
|
selector: "#input-1",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "write",
|
type: "write",
|
||||||
text: "Hello, world!"
|
text: "Hello, world!",
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -675,7 +675,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// uncomment the following line:
|
// uncomment the following line:
|
||||||
// expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
|
// expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO: fix this test (need to fix fire-engine first)
|
// TODO: fix this test (need to fix fire-engine first)
|
||||||
@@ -688,13 +688,13 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "press",
|
type: "press",
|
||||||
key: "ArrowDown"
|
key: "ArrowDown",
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -709,7 +709,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// }
|
// }
|
||||||
// expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
|
// expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO: fix this test (need to fix fire-engine first)
|
// TODO: fix this test (need to fix fire-engine first)
|
||||||
@@ -722,18 +722,18 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
actions: [
|
actions: [
|
||||||
{
|
{
|
||||||
type: "click",
|
type: "click",
|
||||||
selector: "#scroll-bottom-loader"
|
selector: "#scroll-bottom-loader",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "scroll",
|
type: "scroll",
|
||||||
direction: "down",
|
direction: "down",
|
||||||
amount: 2000
|
amount: 2000,
|
||||||
}
|
},
|
||||||
]
|
],
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
FIRECRAWL_API_URL
|
FIRECRAWL_API_URL,
|
||||||
)
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -748,7 +748,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
//
|
//
|
||||||
// expect(response.body.data.markdown).toContain("You have reached the bottom!")
|
// expect(response.body.data.markdown).toContain("You have reached the bottom!")
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO: test scrape action
|
// TODO: test scrape action
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import dotenv from "dotenv";
|
|||||||
import {
|
import {
|
||||||
FirecrawlCrawlResponse,
|
FirecrawlCrawlResponse,
|
||||||
FirecrawlCrawlStatusResponse,
|
FirecrawlCrawlStatusResponse,
|
||||||
FirecrawlScrapeResponse
|
FirecrawlScrapeResponse,
|
||||||
} from "../../types";
|
} from "../../types";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@@ -42,7 +42,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -63,30 +63,30 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.description).toBe(
|
expect(response.body.data.metadata.description).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.keywords).toBe(
|
expect(response.body.data.metadata.keywords).toBe(
|
||||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
"Roast My Website,Roast,Website,GitHub,Firecrawl",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.ogDescription).toBe(
|
expect(response.body.data.metadata.ogDescription).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogUrl).toBe(
|
expect(response.body.data.metadata.ogUrl).toBe(
|
||||||
"https://www.roastmywebsite.ai"
|
"https://www.roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogImage).toBe(
|
expect(response.body.data.metadata.ogImage).toBe(
|
||||||
"https://www.roastmywebsite.ai/og.png"
|
"https://www.roastmywebsite.ai/og.png",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||||
expect(response.body.data.metadata.sourceURL).toBe(
|
expect(response.body.data.metadata.sourceURL).toBe(
|
||||||
"https://roastmywebsite.ai"
|
"https://roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -98,7 +98,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
pageOptions: { includeHtml: true }
|
pageOptions: { includeHtml: true },
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -112,7 +112,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -130,12 +130,12 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("content");
|
expect(response.body.data).toHaveProperty("content");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.content).toContain(
|
expect(response.body.data.content).toContain(
|
||||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -153,12 +153,12 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("content");
|
expect(response.body.data).toHaveProperty("content");
|
||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.content).toContain(
|
expect(response.body.data.content).toContain(
|
||||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
"We present spectrophotometric observations of the Broad Line Radio Galaxy",
|
||||||
);
|
);
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
expect(response.body.data.metadata.pageStatusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -177,16 +177,16 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"Scrape This Site"
|
"Scrape This Site",
|
||||||
);
|
);
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"Lessons and Videos"
|
"Lessons and Videos",
|
||||||
); // #footer
|
); // #footer
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"[Sandbox]("
|
"[Sandbox](",
|
||||||
); // .nav
|
); // .nav
|
||||||
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
expect(responseWithoutRemoveTags.body.data.content).toContain(
|
||||||
"web scraping"
|
"web scraping",
|
||||||
); // strong
|
); // strong
|
||||||
|
|
||||||
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
const response: FirecrawlScrapeResponse = await request(TEST_URL)
|
||||||
@@ -195,7 +195,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
pageOptions: { removeTags: [".nav", "#footer", "strong"] }
|
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -208,7 +208,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
||||||
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -227,10 +227,10 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
expect(response.body.data.metadata.pageStatusCode).toBe(400);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"bad request"
|
"bad request",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -249,10 +249,10 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
expect(response.body.data.metadata.pageStatusCode).toBe(401);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"unauthorized"
|
"unauthorized",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -271,10 +271,10 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
expect(response.body.data.metadata.pageStatusCode).toBe(403);
|
||||||
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
expect(response.body.data.metadata.pageError.toLowerCase()).toContain(
|
||||||
"forbidden"
|
"forbidden",
|
||||||
);
|
);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -293,7 +293,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
expect(response.body.data.metadata.pageStatusCode).toBe(404);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -312,7 +312,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
expect(response.body.data.metadata.pageStatusCode).toBe(405);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -331,7 +331,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.data).toHaveProperty("metadata");
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -351,7 +351,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -365,9 +365,9 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("jobId");
|
expect(response.body).toHaveProperty("jobId");
|
||||||
expect(response.body.jobId).toMatch(
|
expect(response.body.jobId).toMatch(
|
||||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/,
|
||||||
);
|
);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -381,8 +381,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
limit: 10,
|
limit: 10,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
includes: ["blog/*"]
|
includes: ["blog/*"],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
let response: FirecrawlCrawlStatusResponse;
|
let response: FirecrawlCrawlStatusResponse;
|
||||||
@@ -408,7 +408,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(5);
|
expect(urls.length).toBeGreaterThan(5);
|
||||||
urls.forEach((url: string) => {
|
urls.forEach((url: string) => {
|
||||||
@@ -424,13 +424,13 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 180 seconds
|
); // 180 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -444,8 +444,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
limit: 10,
|
limit: 10,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
excludes: ["blog/*"]
|
excludes: ["blog/*"],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
let isFinished = false;
|
let isFinished = false;
|
||||||
@@ -467,20 +467,20 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
|
|
||||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(
|
const completedResponse: FirecrawlCrawlStatusResponse = await request(
|
||||||
TEST_URL
|
TEST_URL,
|
||||||
)
|
)
|
||||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(5);
|
expect(urls.length).toBeGreaterThan(5);
|
||||||
urls.forEach((url: string) => {
|
urls.forEach((url: string) => {
|
||||||
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
|
expect(url.startsWith("https://wwww.mendable.ai/blog/")).toBeFalsy();
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
90000
|
90000,
|
||||||
); // 90 seconds
|
); // 90 seconds
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -492,7 +492,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com",
|
url: "https://www.scrapethissite.com",
|
||||||
crawlerOptions: { maxDepth: 1 }
|
crawlerOptions: { maxDepth: 1 },
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -515,7 +515,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
const completedResponse: FirecrawlCrawlStatusResponse = await request(
|
const completedResponse: FirecrawlCrawlStatusResponse = await request(
|
||||||
TEST_URL
|
TEST_URL,
|
||||||
)
|
)
|
||||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
@@ -528,13 +528,13 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
const urls = completedResponse.body.data.map(
|
const urls = completedResponse.body.data.map(
|
||||||
(item: any) => item.metadata?.sourceURL
|
(item: any) => item.metadata?.sourceURL,
|
||||||
);
|
);
|
||||||
expect(urls.length).toBeGreaterThan(1);
|
expect(urls.length).toBeGreaterThan(1);
|
||||||
|
|
||||||
@@ -550,14 +550,14 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(depth).toBeLessThanOrEqual(2);
|
expect(depth).toBeLessThanOrEqual(2);
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("POST /v0/crawlWebsitePreview", () => {
|
describe("POST /v0/crawlWebsitePreview", () => {
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
|
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
|
||||||
"/v0/crawlWebsitePreview"
|
"/v0/crawlWebsitePreview",
|
||||||
);
|
);
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
@@ -571,7 +571,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -585,7 +585,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
|
|
||||||
expect(response.statusCode).toBe(408);
|
expect(response.statusCode).toBe(408);
|
||||||
},
|
},
|
||||||
3000
|
3000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -604,7 +604,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ query: "test" });
|
.send({ query: "test" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -620,7 +620,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(response.body.success).toBe(true);
|
expect(response.body.success).toBe(true);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds timeout
|
); // 60 seconds timeout
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -637,7 +637,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.get("/v0/crawl/status/123")
|
.get("/v0/crawl/status/123")
|
||||||
.set("Authorization", `Bearer invalid-api-key`);
|
.set("Authorization", `Bearer invalid-api-key`);
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -647,7 +647,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.get("/v0/crawl/status/invalidJobId")
|
.get("/v0/crawl/status/invalidJobId")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
expect(response.statusCode).toBe(404);
|
expect(response.statusCode).toBe(404);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -689,22 +689,22 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
|
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
200
|
200,
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
|
|
||||||
const childrenLinks = completedResponse.body.data.filter(
|
const childrenLinks = completedResponse.body.data.filter(
|
||||||
(doc) =>
|
(doc) =>
|
||||||
doc.metadata &&
|
doc.metadata &&
|
||||||
doc.metadata.sourceURL &&
|
doc.metadata.sourceURL &&
|
||||||
doc.metadata.sourceURL.includes("firecrawl.dev/blog")
|
doc.metadata.sourceURL.includes("firecrawl.dev/blog"),
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||||
},
|
},
|
||||||
180000
|
180000,
|
||||||
); // 120 seconds
|
); // 120 seconds
|
||||||
|
|
||||||
// TODO: review the test below
|
// TODO: review the test below
|
||||||
@@ -762,7 +762,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://docs.tatum.io",
|
url: "https://docs.tatum.io",
|
||||||
crawlerOptions: { limit: 200 }
|
crawlerOptions: { limit: 200 },
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
@@ -798,22 +798,22 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
|
expect(completedResponse.body.data).toEqual(expect.arrayContaining([]));
|
||||||
expect(completedResponse.body).toHaveProperty("partial_data");
|
expect(completedResponse.body).toHaveProperty("partial_data");
|
||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
||||||
"content"
|
"content",
|
||||||
);
|
);
|
||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
||||||
"markdown"
|
"markdown",
|
||||||
);
|
);
|
||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
expect(completedResponse.body.partial_data[0]).toHaveProperty(
|
||||||
"metadata"
|
"metadata",
|
||||||
);
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.partial_data[0].metadata.pageStatusCode
|
completedResponse.body.partial_data[0].metadata.pageStatusCode,
|
||||||
).toBe(200);
|
).toBe(200);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.partial_data[0].metadata.pageError
|
completedResponse.body.partial_data[0].metadata.pageError,
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -828,7 +828,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
onlyMainContent: true
|
onlyMainContent: true,
|
||||||
},
|
},
|
||||||
extractorOptions: {
|
extractorOptions: {
|
||||||
mode: "llm-extraction",
|
mode: "llm-extraction",
|
||||||
@@ -838,18 +838,18 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
company_mission: {
|
company_mission: {
|
||||||
type: "string"
|
type: "string",
|
||||||
},
|
},
|
||||||
supports_sso: {
|
supports_sso: {
|
||||||
type: "boolean"
|
type: "boolean",
|
||||||
},
|
},
|
||||||
is_open_source: {
|
is_open_source: {
|
||||||
type: "boolean"
|
type: "boolean",
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"]
|
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Ensure that the job was successfully created before proceeding with LLM extraction
|
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||||
@@ -868,7 +868,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(llmExtraction.is_open_source).toBe(false);
|
expect(llmExtraction.is_open_source).toBe(false);
|
||||||
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
expect(typeof llmExtraction.is_open_source).toBe("boolean");
|
||||||
},
|
},
|
||||||
60000
|
60000,
|
||||||
); // 60 secs
|
); // 60 secs
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -10,9 +10,9 @@ jest.mock("../auth", () => ({
|
|||||||
success: true,
|
success: true,
|
||||||
team_id: "team123",
|
team_id: "team123",
|
||||||
error: null,
|
error: null,
|
||||||
status: 200
|
status: 200,
|
||||||
}),
|
}),
|
||||||
reduce: jest.fn()
|
reduce: jest.fn(),
|
||||||
}));
|
}));
|
||||||
jest.mock("../../services/idempotency/validate");
|
jest.mock("../../services/idempotency/validate");
|
||||||
|
|
||||||
@@ -21,15 +21,15 @@ describe("crawlController", () => {
|
|||||||
const req = {
|
const req = {
|
||||||
headers: {
|
headers: {
|
||||||
"x-idempotency-key": await uuidv4(),
|
"x-idempotency-key": await uuidv4(),
|
||||||
Authorization: `Bearer ${process.env.TEST_API_KEY}`
|
Authorization: `Bearer ${process.env.TEST_API_KEY}`,
|
||||||
},
|
},
|
||||||
body: {
|
body: {
|
||||||
url: "https://mendable.ai"
|
url: "https://mendable.ai",
|
||||||
}
|
},
|
||||||
} as unknown as Request;
|
} as unknown as Request;
|
||||||
const res = {
|
const res = {
|
||||||
status: jest.fn().mockReturnThis(),
|
status: jest.fn().mockReturnThis(),
|
||||||
json: jest.fn()
|
json: jest.fn(),
|
||||||
} as unknown as Response;
|
} as unknown as Response;
|
||||||
|
|
||||||
// Mock the idempotency key validation to return false for the second call
|
// Mock the idempotency key validation to return false for the second call
|
||||||
@@ -45,7 +45,7 @@ describe("crawlController", () => {
|
|||||||
await crawlController(req, res);
|
await crawlController(req, res);
|
||||||
expect(res.status).toHaveBeenCalledWith(409);
|
expect(res.status).toHaveBeenCalledWith(409);
|
||||||
expect(res.json).toHaveBeenCalledWith({
|
expect(res.json).toHaveBeenCalledWith({
|
||||||
error: "Idempotency key already used"
|
error: "Idempotency key already used",
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import {
|
|||||||
AuthResponse,
|
AuthResponse,
|
||||||
NotificationType,
|
NotificationType,
|
||||||
PlanType,
|
PlanType,
|
||||||
RateLimiterMode
|
RateLimiterMode,
|
||||||
} from "../types";
|
} from "../types";
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { withAuth } from "../lib/withAuth";
|
import { withAuth } from "../lib/withAuth";
|
||||||
@@ -41,7 +41,7 @@ export async function setCachedACUC(
|
|||||||
acuc:
|
acuc:
|
||||||
| AuthCreditUsageChunk
|
| AuthCreditUsageChunk
|
||||||
| null
|
| null
|
||||||
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
|
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null),
|
||||||
) {
|
) {
|
||||||
const cacheKeyACUC = `acuc_${api_key}`;
|
const cacheKeyACUC = `acuc_${api_key}`;
|
||||||
const redLockKey = `lock_${cacheKeyACUC}`;
|
const redLockKey = `lock_${cacheKeyACUC}`;
|
||||||
@@ -76,7 +76,7 @@ export async function setCachedACUC(
|
|||||||
export async function getACUC(
|
export async function getACUC(
|
||||||
api_key: string,
|
api_key: string,
|
||||||
cacheOnly = false,
|
cacheOnly = false,
|
||||||
useCache = true
|
useCache = true,
|
||||||
): Promise<AuthCreditUsageChunk | null> {
|
): Promise<AuthCreditUsageChunk | null> {
|
||||||
const cacheKeyACUC = `acuc_${api_key}`;
|
const cacheKeyACUC = `acuc_${api_key}`;
|
||||||
|
|
||||||
@@ -97,7 +97,7 @@ export async function getACUC(
|
|||||||
({ data, error } = await supabase_service.rpc(
|
({ data, error } = await supabase_service.rpc(
|
||||||
"auth_credit_usage_chunk_test_21_credit_pack",
|
"auth_credit_usage_chunk_test_21_credit_pack",
|
||||||
{ input_key: api_key },
|
{ input_key: api_key },
|
||||||
{ get: true }
|
{ get: true },
|
||||||
));
|
));
|
||||||
|
|
||||||
if (!error) {
|
if (!error) {
|
||||||
@@ -105,13 +105,13 @@ export async function getACUC(
|
|||||||
}
|
}
|
||||||
|
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
|
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...`,
|
||||||
);
|
);
|
||||||
retries++;
|
retries++;
|
||||||
if (retries === maxRetries) {
|
if (retries === maxRetries) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"Failed to retrieve authentication and credit usage data after 3 attempts: " +
|
"Failed to retrieve authentication and credit usage data after 3 attempts: " +
|
||||||
JSON.stringify(error)
|
JSON.stringify(error),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -143,19 +143,19 @@ export async function clearACUC(api_key: string): Promise<void> {
|
|||||||
export async function authenticateUser(
|
export async function authenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
mode?: RateLimiterMode
|
mode?: RateLimiterMode,
|
||||||
): Promise<AuthResponse> {
|
): Promise<AuthResponse> {
|
||||||
return withAuth(supaAuthenticateUser, {
|
return withAuth(supaAuthenticateUser, {
|
||||||
success: true,
|
success: true,
|
||||||
chunk: null,
|
chunk: null,
|
||||||
team_id: "bypass"
|
team_id: "bypass",
|
||||||
})(req, res, mode);
|
})(req, res, mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function supaAuthenticateUser(
|
export async function supaAuthenticateUser(
|
||||||
req,
|
req,
|
||||||
res,
|
res,
|
||||||
mode?: RateLimiterMode
|
mode?: RateLimiterMode,
|
||||||
): Promise<AuthResponse> {
|
): Promise<AuthResponse> {
|
||||||
const authHeader =
|
const authHeader =
|
||||||
req.headers.authorization ??
|
req.headers.authorization ??
|
||||||
@@ -170,7 +170,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Unauthorized: Token missing",
|
error: "Unauthorized: Token missing",
|
||||||
status: 401
|
status: 401,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -199,7 +199,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Unauthorized: Invalid token",
|
error: "Unauthorized: Invalid token",
|
||||||
status: 401
|
status: 401,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -209,7 +209,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Unauthorized: Invalid token",
|
error: "Unauthorized: Invalid token",
|
||||||
status: 401
|
status: 401,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -219,14 +219,14 @@ export async function supaAuthenticateUser(
|
|||||||
const plan = getPlanByPriceId(priceId);
|
const plan = getPlanByPriceId(priceId);
|
||||||
subscriptionData = {
|
subscriptionData = {
|
||||||
team_id: teamId,
|
team_id: teamId,
|
||||||
plan
|
plan,
|
||||||
};
|
};
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case RateLimiterMode.Crawl:
|
case RateLimiterMode.Crawl:
|
||||||
rateLimiter = getRateLimiter(
|
rateLimiter = getRateLimiter(
|
||||||
RateLimiterMode.Crawl,
|
RateLimiterMode.Crawl,
|
||||||
token,
|
token,
|
||||||
subscriptionData.plan
|
subscriptionData.plan,
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
case RateLimiterMode.Scrape:
|
case RateLimiterMode.Scrape:
|
||||||
@@ -234,21 +234,21 @@ export async function supaAuthenticateUser(
|
|||||||
RateLimiterMode.Scrape,
|
RateLimiterMode.Scrape,
|
||||||
token,
|
token,
|
||||||
subscriptionData.plan,
|
subscriptionData.plan,
|
||||||
teamId
|
teamId,
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
case RateLimiterMode.Search:
|
case RateLimiterMode.Search:
|
||||||
rateLimiter = getRateLimiter(
|
rateLimiter = getRateLimiter(
|
||||||
RateLimiterMode.Search,
|
RateLimiterMode.Search,
|
||||||
token,
|
token,
|
||||||
subscriptionData.plan
|
subscriptionData.plan,
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
case RateLimiterMode.Map:
|
case RateLimiterMode.Map:
|
||||||
rateLimiter = getRateLimiter(
|
rateLimiter = getRateLimiter(
|
||||||
RateLimiterMode.Map,
|
RateLimiterMode.Map,
|
||||||
token,
|
token,
|
||||||
subscriptionData.plan
|
subscriptionData.plan,
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
case RateLimiterMode.CrawlStatus:
|
case RateLimiterMode.CrawlStatus:
|
||||||
@@ -278,7 +278,7 @@ export async function supaAuthenticateUser(
|
|||||||
priceId,
|
priceId,
|
||||||
plan: subscriptionData?.plan,
|
plan: subscriptionData?.plan,
|
||||||
mode,
|
mode,
|
||||||
rateLimiterRes
|
rateLimiterRes,
|
||||||
});
|
});
|
||||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||||
@@ -293,7 +293,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||||
status: 429
|
status: 429,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -323,7 +323,7 @@ export async function supaAuthenticateUser(
|
|||||||
success: true,
|
success: true,
|
||||||
team_id: teamId ?? undefined,
|
team_id: teamId ?? undefined,
|
||||||
plan: (subscriptionData?.plan ?? "") as PlanType,
|
plan: (subscriptionData?.plan ?? "") as PlanType,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
function getPlanByPriceId(price_id: string | null): PlanType {
|
function getPlanByPriceId(price_id: string | null): PlanType {
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import { sendSlackWebhook } from "../../../services/alerts/slack";
|
|||||||
|
|
||||||
export async function cleanBefore24hCompleteJobsController(
|
export async function cleanBefore24hCompleteJobsController(
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response
|
res: Response,
|
||||||
) {
|
) {
|
||||||
logger.info("🐂 Cleaning jobs older than 24h");
|
logger.info("🐂 Cleaning jobs older than 24h");
|
||||||
try {
|
try {
|
||||||
@@ -22,8 +22,8 @@ export async function cleanBefore24hCompleteJobsController(
|
|||||||
["completed"],
|
["completed"],
|
||||||
i * batchSize,
|
i * batchSize,
|
||||||
i * batchSize + batchSize,
|
i * batchSize + batchSize,
|
||||||
true
|
true,
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
const completedJobs: Job[] = (
|
const completedJobs: Job[] = (
|
||||||
@@ -33,7 +33,7 @@ export async function cleanBefore24hCompleteJobsController(
|
|||||||
completedJobs.filter(
|
completedJobs.filter(
|
||||||
(job) =>
|
(job) =>
|
||||||
job.finishedOn !== undefined &&
|
job.finishedOn !== undefined &&
|
||||||
job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
job.finishedOn < Date.now() - 24 * 60 * 60 * 1000,
|
||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
let count = 0;
|
let count = 0;
|
||||||
@@ -73,14 +73,14 @@ export async function queuesController(req: Request, res: Response) {
|
|||||||
const scrapeQueue = getScrapeQueue();
|
const scrapeQueue = getScrapeQueue();
|
||||||
|
|
||||||
const [webScraperActive] = await Promise.all([
|
const [webScraperActive] = await Promise.all([
|
||||||
scrapeQueue.getActiveCount()
|
scrapeQueue.getActiveCount(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const noActiveJobs = webScraperActive === 0;
|
const noActiveJobs = webScraperActive === 0;
|
||||||
// 200 if no active jobs, 503 if there are active jobs
|
// 200 if no active jobs, 503 if there are active jobs
|
||||||
return res.status(noActiveJobs ? 200 : 500).json({
|
return res.status(noActiveJobs ? 200 : 500).json({
|
||||||
webScraperActive,
|
webScraperActive,
|
||||||
noActiveJobs
|
noActiveJobs,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
@@ -99,7 +99,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
await Promise.all([
|
await Promise.all([
|
||||||
scrapeQueue.getActiveCount(),
|
scrapeQueue.getActiveCount(),
|
||||||
scrapeQueue.getWaitingCount(),
|
scrapeQueue.getWaitingCount(),
|
||||||
scrapeQueue.getPrioritizedCount()
|
scrapeQueue.getPrioritizedCount(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
||||||
@@ -109,9 +109,9 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`
|
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
const machines = await request.json();
|
const machines = await request.json();
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
(machine.state === "started" ||
|
(machine.state === "started" ||
|
||||||
machine.state === "starting" ||
|
machine.state === "starting" ||
|
||||||
machine.state === "replacing") &&
|
machine.state === "replacing") &&
|
||||||
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
|
machine.config.env["FLY_PROCESS_GROUP"] === "worker",
|
||||||
).length;
|
).length;
|
||||||
|
|
||||||
let targetMachineCount = activeMachines;
|
let targetMachineCount = activeMachines;
|
||||||
@@ -134,17 +134,17 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
|
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
|
||||||
targetMachineCount = Math.min(
|
targetMachineCount = Math.min(
|
||||||
maxNumberOfMachines,
|
maxNumberOfMachines,
|
||||||
activeMachines + baseScaleUp * 3
|
activeMachines + baseScaleUp * 3,
|
||||||
);
|
);
|
||||||
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
|
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
|
||||||
targetMachineCount = Math.min(
|
targetMachineCount = Math.min(
|
||||||
maxNumberOfMachines,
|
maxNumberOfMachines,
|
||||||
activeMachines + baseScaleUp * 2
|
activeMachines + baseScaleUp * 2,
|
||||||
);
|
);
|
||||||
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
|
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
|
||||||
targetMachineCount = Math.min(
|
targetMachineCount = Math.min(
|
||||||
maxNumberOfMachines,
|
maxNumberOfMachines,
|
||||||
activeMachines + baseScaleUp
|
activeMachines + baseScaleUp,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -152,47 +152,47 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
|
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
|
||||||
targetMachineCount = Math.max(
|
targetMachineCount = Math.max(
|
||||||
minNumberOfMachines,
|
minNumberOfMachines,
|
||||||
activeMachines - baseScaleDown * 3
|
activeMachines - baseScaleDown * 3,
|
||||||
);
|
);
|
||||||
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
|
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
|
||||||
targetMachineCount = Math.max(
|
targetMachineCount = Math.max(
|
||||||
minNumberOfMachines,
|
minNumberOfMachines,
|
||||||
activeMachines - baseScaleDown * 2
|
activeMachines - baseScaleDown * 2,
|
||||||
);
|
);
|
||||||
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
|
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
|
||||||
targetMachineCount = Math.max(
|
targetMachineCount = Math.max(
|
||||||
minNumberOfMachines,
|
minNumberOfMachines,
|
||||||
activeMachines - baseScaleDown
|
activeMachines - baseScaleDown,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (targetMachineCount !== activeMachines) {
|
if (targetMachineCount !== activeMachines) {
|
||||||
logger.info(
|
logger.info(
|
||||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (targetMachineCount > activeMachines) {
|
if (targetMachineCount > activeMachines) {
|
||||||
sendSlackWebhook(
|
sendSlackWebhook(
|
||||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||||
false,
|
false,
|
||||||
process.env.SLACK_AUTOSCALER ?? ""
|
process.env.SLACK_AUTOSCALER ?? "",
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
sendSlackWebhook(
|
sendSlackWebhook(
|
||||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||||
false,
|
false,
|
||||||
process.env.SLACK_AUTOSCALER ?? ""
|
process.env.SLACK_AUTOSCALER ?? "",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
mode: "scale-descale",
|
mode: "scale-descale",
|
||||||
count: targetMachineCount
|
count: targetMachineCount,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
mode: "normal",
|
mode: "normal",
|
||||||
count: activeMachines
|
count: activeMachines,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||||||
try {
|
try {
|
||||||
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
|
await retryOperation(() => redisRateLimitClient.set(testKey, testValue));
|
||||||
redisRateLimitHealth = await retryOperation(() =>
|
redisRateLimitHealth = await retryOperation(() =>
|
||||||
redisRateLimitClient.get(testKey)
|
redisRateLimitClient.get(testKey),
|
||||||
);
|
);
|
||||||
await retryOperation(() => redisRateLimitClient.del(testKey));
|
await retryOperation(() => redisRateLimitClient.del(testKey));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -49,7 +49,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||||||
const healthStatus = {
|
const healthStatus = {
|
||||||
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
|
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
|
||||||
redisRateLimitClient:
|
redisRateLimitClient:
|
||||||
redisRateLimitHealth === testValue ? "healthy" : "unhealthy"
|
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
|
||||||
};
|
};
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -60,7 +60,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||||||
return res.status(200).json({ status: "healthy", details: healthStatus });
|
return res.status(200).json({ status: "healthy", details: healthStatus });
|
||||||
} else {
|
} else {
|
||||||
logger.info(
|
logger.info(
|
||||||
`Redis instances health check: ${JSON.stringify(healthStatus)}`
|
`Redis instances health check: ${JSON.stringify(healthStatus)}`,
|
||||||
);
|
);
|
||||||
// await sendSlackWebhook(
|
// await sendSlackWebhook(
|
||||||
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
|
// `[REDIS DOWN] Redis instances health check: ${JSON.stringify(
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
status: "cancelled"
|
status: "cancelled",
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
|
|||||||
@@ -60,12 +60,12 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
// Combine jobs and jobStatuses into a single array of objects
|
// Combine jobs and jobStatuses into a single array of objects
|
||||||
let jobsWithStatuses = jobs.map((job, index) => ({
|
let jobsWithStatuses = jobs.map((job, index) => ({
|
||||||
job,
|
job,
|
||||||
status: jobStatuses[index]
|
status: jobStatuses[index],
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Filter out failed jobs
|
// Filter out failed jobs
|
||||||
jobsWithStatuses = jobsWithStatuses.filter(
|
jobsWithStatuses = jobsWithStatuses.filter(
|
||||||
(x) => x.status !== "failed" && x.status !== "unknown"
|
(x) => x.status !== "failed" && x.status !== "unknown",
|
||||||
);
|
);
|
||||||
|
|
||||||
// Sort jobs by timestamp
|
// Sort jobs by timestamp
|
||||||
@@ -84,10 +84,10 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
const data = jobs
|
const data = jobs
|
||||||
.filter(
|
.filter(
|
||||||
(x) =>
|
(x) =>
|
||||||
x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null
|
x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null,
|
||||||
)
|
)
|
||||||
.map((x) =>
|
.map((x) =>
|
||||||
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
|
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -117,7 +117,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
? []
|
? []
|
||||||
: data
|
: data
|
||||||
.filter((x) => x !== null)
|
.filter((x) => x !== null)
|
||||||
.map((x) => toLegacyDocument(x, sc.internalOptions))
|
.map((x) => toLegacyDocument(x, sc.internalOptions)),
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
|||||||
import {
|
import {
|
||||||
defaultCrawlPageOptions,
|
defaultCrawlPageOptions,
|
||||||
defaultCrawlerOptions,
|
defaultCrawlerOptions,
|
||||||
defaultOrigin
|
defaultOrigin,
|
||||||
} from "../../../src/lib/default-values";
|
} from "../../../src/lib/default-values";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
@@ -21,7 +21,7 @@ import {
|
|||||||
lockURL,
|
lockURL,
|
||||||
lockURLs,
|
lockURLs,
|
||||||
saveCrawl,
|
saveCrawl,
|
||||||
StoredCrawl
|
StoredCrawl,
|
||||||
} from "../../../src/lib/crawl-redis";
|
} from "../../../src/lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||||
@@ -54,7 +54,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const crawlerOptions = {
|
const crawlerOptions = {
|
||||||
...defaultCrawlerOptions,
|
...defaultCrawlerOptions,
|
||||||
...req.body.crawlerOptions
|
...req.body.crawlerOptions,
|
||||||
};
|
};
|
||||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||||
|
|
||||||
@@ -82,13 +82,13 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
const {
|
const {
|
||||||
success: creditsCheckSuccess,
|
success: creditsCheckSuccess,
|
||||||
message: creditsCheckMessage,
|
message: creditsCheckMessage,
|
||||||
remainingCredits
|
remainingCredits,
|
||||||
} = await checkTeamCredits(chunk, team_id, limitCheck);
|
} = await checkTeamCredits(chunk, team_id, limitCheck);
|
||||||
|
|
||||||
if (!creditsCheckSuccess) {
|
if (!creditsCheckSuccess) {
|
||||||
return res.status(402).json({
|
return res.status(402).json({
|
||||||
error:
|
error:
|
||||||
"Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com"
|
"Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -113,7 +113,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
if (isUrlBlocked(url)) {
|
if (isUrlBlocked(url)) {
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
error:
|
error:
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,7 +153,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
|
||||||
pageOptions,
|
pageOptions,
|
||||||
undefined,
|
undefined,
|
||||||
undefined
|
undefined,
|
||||||
);
|
);
|
||||||
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||||
|
|
||||||
@@ -166,7 +166,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
createdAt: Date.now()
|
createdAt: Date.now(),
|
||||||
};
|
};
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
@@ -204,23 +204,23 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
plan,
|
plan,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true
|
sitemapped: true,
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
priority: jobPriority
|
priority: jobPriority,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
await lockURLs(
|
await lockURLs(
|
||||||
id,
|
id,
|
||||||
sc,
|
sc,
|
||||||
jobs.map((x) => x.data.url)
|
jobs.map((x) => x.data.url),
|
||||||
);
|
);
|
||||||
await addCrawlJobs(
|
await addCrawlJobs(
|
||||||
id,
|
id,
|
||||||
jobs.map((x) => x.opts.jobId)
|
jobs.map((x) => x.opts.jobId),
|
||||||
);
|
);
|
||||||
for (const job of jobs) {
|
for (const job of jobs) {
|
||||||
// add with sentry instrumentation
|
// add with sentry instrumentation
|
||||||
@@ -243,12 +243,12 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
team_id,
|
team_id,
|
||||||
plan: plan!,
|
plan: plan!,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
crawl_id: id
|
crawl_id: id,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
priority: 15 // prioritize request 0 of crawl jobs same as scrape jobs
|
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
|
||||||
},
|
},
|
||||||
jobId
|
jobId,
|
||||||
);
|
);
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
@@ -258,7 +258,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
error: error instanceof ZodError ? "Invalid URL" : error.message
|
error: error instanceof ZodError ? "Invalid URL" : error.message,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import {
|
|||||||
crawlToCrawler,
|
crawlToCrawler,
|
||||||
lockURL,
|
lockURL,
|
||||||
saveCrawl,
|
saveCrawl,
|
||||||
StoredCrawl
|
StoredCrawl,
|
||||||
} from "../../../src/lib/crawl-redis";
|
} from "../../../src/lib/crawl-redis";
|
||||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||||
@@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
if (isUrlBlocked(url)) {
|
if (isUrlBlocked(url)) {
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
error:
|
error:
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,7 +51,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
const pageOptions = req.body.pageOptions ?? {
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
removeTags: []
|
removeTags: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||||
@@ -94,7 +94,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
|
||||||
pageOptions,
|
pageOptions,
|
||||||
undefined,
|
undefined,
|
||||||
undefined
|
undefined,
|
||||||
);
|
);
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
@@ -105,7 +105,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
robots,
|
robots,
|
||||||
createdAt: Date.now()
|
createdAt: Date.now(),
|
||||||
};
|
};
|
||||||
|
|
||||||
await saveCrawl(id, sc);
|
await saveCrawl(id, sc);
|
||||||
@@ -131,10 +131,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
origin: "website-preview",
|
origin: "website-preview",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true
|
sitemapped: true,
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId
|
jobId,
|
||||||
);
|
);
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
@@ -151,10 +151,10 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
internalOptions,
|
internalOptions,
|
||||||
origin: "website-preview",
|
origin: "website-preview",
|
||||||
crawl_id: id
|
crawl_id: id,
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId
|
jobId,
|
||||||
);
|
);
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { ExtractorOptions, PageOptions } from "./../../lib/entities";
|
|||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import {
|
import {
|
||||||
billTeam,
|
billTeam,
|
||||||
checkTeamCredits
|
checkTeamCredits,
|
||||||
} from "../../services/billing/credit_billing";
|
} from "../../services/billing/credit_billing";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { PlanType, RateLimiterMode } from "../../types";
|
import { PlanType, RateLimiterMode } from "../../types";
|
||||||
@@ -11,7 +11,7 @@ import {
|
|||||||
Document,
|
Document,
|
||||||
fromLegacyCombo,
|
fromLegacyCombo,
|
||||||
toLegacyDocument,
|
toLegacyDocument,
|
||||||
url as urlSchema
|
url as urlSchema,
|
||||||
} from "../v1/types";
|
} from "../v1/types";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||||
@@ -19,7 +19,7 @@ import {
|
|||||||
defaultPageOptions,
|
defaultPageOptions,
|
||||||
defaultExtractorOptions,
|
defaultExtractorOptions,
|
||||||
defaultTimeout,
|
defaultTimeout,
|
||||||
defaultOrigin
|
defaultOrigin,
|
||||||
} from "../../lib/default-values";
|
} from "../../lib/default-values";
|
||||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
@@ -38,7 +38,7 @@ export async function scrapeHelper(
|
|||||||
pageOptions: PageOptions,
|
pageOptions: PageOptions,
|
||||||
extractorOptions: ExtractorOptions,
|
extractorOptions: ExtractorOptions,
|
||||||
timeout: number,
|
timeout: number,
|
||||||
plan?: PlanType
|
plan?: PlanType,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
@@ -55,7 +55,7 @@ export async function scrapeHelper(
|
|||||||
success: false,
|
success: false,
|
||||||
error:
|
error:
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
returnCode: 403
|
returnCode: 403,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -65,7 +65,7 @@ export async function scrapeHelper(
|
|||||||
pageOptions,
|
pageOptions,
|
||||||
extractorOptions,
|
extractorOptions,
|
||||||
timeout,
|
timeout,
|
||||||
crawlerOptions
|
crawlerOptions,
|
||||||
);
|
);
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
@@ -77,11 +77,11 @@ export async function scrapeHelper(
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
plan: plan!,
|
plan: plan!,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
is_scrape: true
|
is_scrape: true,
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId,
|
jobId,
|
||||||
jobPriority
|
jobPriority,
|
||||||
);
|
);
|
||||||
|
|
||||||
let doc;
|
let doc;
|
||||||
@@ -90,7 +90,7 @@ export async function scrapeHelper(
|
|||||||
{
|
{
|
||||||
name: "Wait for job to finish",
|
name: "Wait for job to finish",
|
||||||
op: "bullmq.wait",
|
op: "bullmq.wait",
|
||||||
attributes: { job: jobId }
|
attributes: { job: jobId },
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
try {
|
try {
|
||||||
@@ -104,20 +104,20 @@ export async function scrapeHelper(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Request timed out",
|
error: "Request timed out",
|
||||||
returnCode: 408
|
returnCode: 408,
|
||||||
};
|
};
|
||||||
} else if (
|
} else if (
|
||||||
typeof e === "string" &&
|
typeof e === "string" &&
|
||||||
(e.includes("Error generating completions: ") ||
|
(e.includes("Error generating completions: ") ||
|
||||||
e.includes("Invalid schema for function") ||
|
e.includes("Invalid schema for function") ||
|
||||||
e.includes(
|
e.includes(
|
||||||
"LLM extraction did not match the extraction schema you provided."
|
"LLM extraction did not match the extraction schema you provided.",
|
||||||
))
|
))
|
||||||
) {
|
) {
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: e,
|
error: e,
|
||||||
returnCode: 500
|
returnCode: 500,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
throw e;
|
throw e;
|
||||||
@@ -125,7 +125,7 @@ export async function scrapeHelper(
|
|||||||
}
|
}
|
||||||
span.setAttribute("result", JSON.stringify(doc));
|
span.setAttribute("result", JSON.stringify(doc));
|
||||||
return null;
|
return null;
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
if (err !== null) {
|
if (err !== null) {
|
||||||
@@ -140,7 +140,7 @@ export async function scrapeHelper(
|
|||||||
success: true,
|
success: true,
|
||||||
error: "No page found",
|
error: "No page found",
|
||||||
returnCode: 200,
|
returnCode: 200,
|
||||||
data: doc
|
data: doc,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -166,7 +166,7 @@ export async function scrapeHelper(
|
|||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: toLegacyDocument(doc, internalOptions),
|
data: toLegacyDocument(doc, internalOptions),
|
||||||
returnCode: 200
|
returnCode: 200,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -185,7 +185,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||||
const extractorOptions = {
|
const extractorOptions = {
|
||||||
...defaultExtractorOptions,
|
...defaultExtractorOptions,
|
||||||
...req.body.extractorOptions
|
...req.body.extractorOptions,
|
||||||
};
|
};
|
||||||
const origin = req.body.origin ?? defaultOrigin;
|
const origin = req.body.origin ?? defaultOrigin;
|
||||||
let timeout = req.body.timeout ?? defaultTimeout;
|
let timeout = req.body.timeout ?? defaultTimeout;
|
||||||
@@ -197,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
) {
|
) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
error:
|
error:
|
||||||
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified"
|
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -213,7 +213,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
return res.status(402).json({
|
return res.status(402).json({
|
||||||
error:
|
error:
|
||||||
"Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing"
|
"Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -221,7 +221,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
error:
|
error:
|
||||||
"Error checking team credits. Please contact help@firecrawl.com for help."
|
"Error checking team credits. Please contact help@firecrawl.com for help.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -236,7 +236,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
pageOptions,
|
pageOptions,
|
||||||
extractorOptions,
|
extractorOptions,
|
||||||
timeout,
|
timeout,
|
||||||
plan
|
plan,
|
||||||
);
|
);
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
@@ -244,7 +244,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
result.data && (result.data as Document).markdown
|
result.data && (result.data as Document).markdown
|
||||||
? numTokensFromString(
|
? numTokensFromString(
|
||||||
(result.data as Document).markdown!,
|
(result.data as Document).markdown!,
|
||||||
"gpt-3.5-turbo"
|
"gpt-3.5-turbo",
|
||||||
)
|
)
|
||||||
: 0;
|
: 0;
|
||||||
|
|
||||||
@@ -267,7 +267,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
// billing for doc done on queue end, bill only for llm extraction
|
// billing for doc done on queue end, bill only for llm extraction
|
||||||
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => {
|
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`
|
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
|
||||||
);
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
@@ -290,7 +290,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
const { scrapeOptions } = fromLegacyScrapeOptions(
|
const { scrapeOptions } = fromLegacyScrapeOptions(
|
||||||
pageOptions,
|
pageOptions,
|
||||||
extractorOptions,
|
extractorOptions,
|
||||||
timeout
|
timeout,
|
||||||
);
|
);
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
@@ -306,7 +306,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
num_tokens: numTokens
|
num_tokens: numTokens,
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
@@ -319,7 +319,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
? "Invalid URL"
|
? "Invalid URL"
|
||||||
: typeof error === "string"
|
: typeof error === "string"
|
||||||
? error
|
? error
|
||||||
: (error?.message ?? "Internal Server Error")
|
: (error?.message ?? "Internal Server Error"),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import {
|
import {
|
||||||
billTeam,
|
billTeam,
|
||||||
checkTeamCredits
|
checkTeamCredits,
|
||||||
} from "../../services/billing/credit_billing";
|
} from "../../services/billing/credit_billing";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { PlanType, RateLimiterMode } from "../../types";
|
import { PlanType, RateLimiterMode } from "../../types";
|
||||||
@@ -20,7 +20,7 @@ import {
|
|||||||
Document,
|
Document,
|
||||||
fromLegacyCombo,
|
fromLegacyCombo,
|
||||||
fromLegacyScrapeOptions,
|
fromLegacyScrapeOptions,
|
||||||
toLegacyDocument
|
toLegacyDocument,
|
||||||
} from "../v1/types";
|
} from "../v1/types";
|
||||||
|
|
||||||
export async function searchHelper(
|
export async function searchHelper(
|
||||||
@@ -31,7 +31,7 @@ export async function searchHelper(
|
|||||||
crawlerOptions: any,
|
crawlerOptions: any,
|
||||||
pageOptions: PageOptions,
|
pageOptions: PageOptions,
|
||||||
searchOptions: SearchOptions,
|
searchOptions: SearchOptions,
|
||||||
plan: PlanType | undefined
|
plan: PlanType | undefined,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
error?: string;
|
error?: string;
|
||||||
@@ -62,7 +62,7 @@ export async function searchHelper(
|
|||||||
filter: filter,
|
filter: filter,
|
||||||
lang: searchOptions.lang ?? "en",
|
lang: searchOptions.lang ?? "en",
|
||||||
country: searchOptions.country ?? "us",
|
country: searchOptions.country ?? "us",
|
||||||
location: searchOptions.location
|
location: searchOptions.location,
|
||||||
});
|
});
|
||||||
|
|
||||||
let justSearch = pageOptions.fetchPageContent === false;
|
let justSearch = pageOptions.fetchPageContent === false;
|
||||||
@@ -71,13 +71,13 @@ export async function searchHelper(
|
|||||||
pageOptions,
|
pageOptions,
|
||||||
undefined,
|
undefined,
|
||||||
60000,
|
60000,
|
||||||
crawlerOptions
|
crawlerOptions,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (justSearch) {
|
if (justSearch) {
|
||||||
billTeam(team_id, subscription_id, res.length).catch((error) => {
|
billTeam(team_id, subscription_id, res.length).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${team_id} for ${res.length} credits: ${error}`
|
`Failed to bill team ${team_id} for ${res.length} credits: ${error}`,
|
||||||
);
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
@@ -107,12 +107,12 @@ export async function searchHelper(
|
|||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
internalOptions
|
internalOptions,
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
priority: jobPriority
|
priority: jobPriority,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -123,7 +123,7 @@ export async function searchHelper(
|
|||||||
|
|
||||||
const docs = (
|
const docs = (
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
jobDatas.map((x) => waitForJob<Document>(x.opts.jobId, 60000))
|
jobDatas.map((x) => waitForJob<Document>(x.opts.jobId, 60000)),
|
||||||
)
|
)
|
||||||
).map((x) => toLegacyDocument(x, internalOptions));
|
).map((x) => toLegacyDocument(x, internalOptions));
|
||||||
|
|
||||||
@@ -136,7 +136,7 @@ export async function searchHelper(
|
|||||||
|
|
||||||
// make sure doc.content is not empty
|
// make sure doc.content is not empty
|
||||||
const filteredDocs = docs.filter(
|
const filteredDocs = docs.filter(
|
||||||
(doc: any) => doc && doc.content && doc.content.trim().length > 0
|
(doc: any) => doc && doc.content && doc.content.trim().length > 0,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
@@ -144,14 +144,14 @@ export async function searchHelper(
|
|||||||
success: true,
|
success: true,
|
||||||
error: "No page found",
|
error: "No page found",
|
||||||
returnCode: 200,
|
returnCode: 200,
|
||||||
data: docs
|
data: docs,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: filteredDocs,
|
data: filteredDocs,
|
||||||
returnCode: 200
|
returnCode: 200,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -169,7 +169,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||||
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||||
removeTags: req.body.pageOptions?.removeTags ?? [],
|
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||||
fallback: req.body.pageOptions?.fallback ?? false
|
fallback: req.body.pageOptions?.fallback ?? false,
|
||||||
};
|
};
|
||||||
const origin = req.body.origin ?? "api";
|
const origin = req.body.origin ?? "api";
|
||||||
|
|
||||||
@@ -197,7 +197,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
crawlerOptions,
|
crawlerOptions,
|
||||||
pageOptions,
|
pageOptions,
|
||||||
searchOptions,
|
searchOptions,
|
||||||
plan
|
plan,
|
||||||
);
|
);
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
@@ -212,7 +212,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
mode: "search",
|
mode: "search",
|
||||||
url: req.body.query,
|
url: req.body.query,
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
origin: origin
|
origin: origin,
|
||||||
});
|
});
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import * as Sentry from "@sentry/node";
|
|||||||
|
|
||||||
export async function crawlJobStatusPreviewController(
|
export async function crawlJobStatusPreviewController(
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response
|
res: Response,
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
@@ -26,7 +26,7 @@ export async function crawlJobStatusPreviewController(
|
|||||||
// }
|
// }
|
||||||
|
|
||||||
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort(
|
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort(
|
||||||
(a, b) => a.timestamp - b.timestamp
|
(a, b) => a.timestamp - b.timestamp,
|
||||||
);
|
);
|
||||||
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
|
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
|
||||||
const jobStatus = sc.cancelled
|
const jobStatus = sc.cancelled
|
||||||
@@ -38,7 +38,7 @@ export async function crawlJobStatusPreviewController(
|
|||||||
: "active";
|
: "active";
|
||||||
|
|
||||||
const data = jobs.map((x) =>
|
const data = jobs.map((x) =>
|
||||||
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
|
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue,
|
||||||
);
|
);
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
@@ -48,7 +48,7 @@ export async function crawlJobStatusPreviewController(
|
|||||||
total: jobs.length,
|
total: jobs.length,
|
||||||
data: jobStatus === "completed" ? data : null,
|
data: jobStatus === "completed" ? data : null,
|
||||||
partial_data:
|
partial_data:
|
||||||
jobStatus === "completed" ? [] : data.filter((x) => x !== null)
|
jobStatus === "completed" ? [] : data.filter((x) => x !== null),
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
|
|||||||
@@ -25,13 +25,13 @@ describe("URL Schema Validation", () => {
|
|||||||
|
|
||||||
it("should reject URLs without a valid top-level domain", () => {
|
it("should reject URLs without a valid top-level domain", () => {
|
||||||
expect(() => url.parse("http://example")).toThrow(
|
expect(() => url.parse("http://example")).toThrow(
|
||||||
"URL must have a valid top-level domain or be a valid path"
|
"URL must have a valid top-level domain or be a valid path",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject blocked URLs", () => {
|
it("should reject blocked URLs", () => {
|
||||||
expect(() => url.parse("https://facebook.com")).toThrow(
|
expect(() => url.parse("https://facebook.com")).toThrow(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -47,28 +47,28 @@ describe("URL Schema Validation", () => {
|
|||||||
|
|
||||||
it("should handle URLs with subdomains that are blocked", () => {
|
it("should handle URLs with subdomains that are blocked", () => {
|
||||||
expect(() => url.parse("https://sub.facebook.com")).toThrow(
|
expect(() => url.parse("https://sub.facebook.com")).toThrow(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should handle URLs with paths that are blocked", () => {
|
it("should handle URLs with paths that are blocked", () => {
|
||||||
expect(() => url.parse("http://facebook.com/path")).toThrow(
|
expect(() => url.parse("http://facebook.com/path")).toThrow(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
expect(() => url.parse("https://facebook.com/another/path")).toThrow(
|
expect(() => url.parse("https://facebook.com/another/path")).toThrow(
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject malformed URLs starting with 'http://http'", () => {
|
it("should reject malformed URLs starting with 'http://http'", () => {
|
||||||
expect(() => url.parse("http://http://example.com")).toThrow(
|
expect(() => url.parse("http://http://example.com")).toThrow(
|
||||||
"Invalid URL. Invalid protocol."
|
"Invalid URL. Invalid protocol.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||||
expect(() =>
|
expect(() =>
|
||||||
url.parse("http://example.com/http://example.com")
|
url.parse("http://example.com/http://example.com"),
|
||||||
).not.toThrow();
|
).not.toThrow();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -5,14 +5,14 @@ import {
|
|||||||
batchScrapeRequestSchema,
|
batchScrapeRequestSchema,
|
||||||
CrawlResponse,
|
CrawlResponse,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
ScrapeOptions
|
ScrapeOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
addCrawlJobs,
|
addCrawlJobs,
|
||||||
getCrawl,
|
getCrawl,
|
||||||
lockURLs,
|
lockURLs,
|
||||||
saveCrawl,
|
saveCrawl,
|
||||||
StoredCrawl
|
StoredCrawl,
|
||||||
} from "../../lib/crawl-redis";
|
} from "../../lib/crawl-redis";
|
||||||
import { logCrawl } from "../../services/logging/crawl_log";
|
import { logCrawl } from "../../services/logging/crawl_log";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
@@ -22,7 +22,7 @@ import { logger as _logger } from "../../lib/logger";
|
|||||||
|
|
||||||
export async function batchScrapeController(
|
export async function batchScrapeController(
|
||||||
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
|
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
|
||||||
res: Response<CrawlResponse>
|
res: Response<CrawlResponse>,
|
||||||
) {
|
) {
|
||||||
req.body = batchScrapeRequestSchema.parse(req.body);
|
req.body = batchScrapeRequestSchema.parse(req.body);
|
||||||
|
|
||||||
@@ -33,12 +33,12 @@ export async function batchScrapeController(
|
|||||||
module: "api/v1",
|
module: "api/v1",
|
||||||
method: "batchScrapeController",
|
method: "batchScrapeController",
|
||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
plan: req.auth.plan
|
plan: req.auth.plan,
|
||||||
});
|
});
|
||||||
logger.debug("Batch scrape " + id + " starting", {
|
logger.debug("Batch scrape " + id + " starting", {
|
||||||
urlsLength: req.body.urls,
|
urlsLength: req.body.urls,
|
||||||
appendToId: req.body.appendToId,
|
appendToId: req.body.appendToId,
|
||||||
account: req.account
|
account: req.account,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!req.body.appendToId) {
|
if (!req.body.appendToId) {
|
||||||
@@ -59,7 +59,7 @@ export async function batchScrapeController(
|
|||||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan
|
plan: req.auth.plan,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!req.body.appendToId) {
|
if (!req.body.appendToId) {
|
||||||
@@ -75,7 +75,7 @@ export async function batchScrapeController(
|
|||||||
jobPriority = await getJobPriority({
|
jobPriority = await getJobPriority({
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
basePriority: 21
|
basePriority: 21,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||||
@@ -97,12 +97,12 @@ export async function batchScrapeController(
|
|||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
v1: true,
|
v1: true,
|
||||||
webhook: req.body.webhook
|
webhook: req.body.webhook,
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuidv4(),
|
jobId: uuidv4(),
|
||||||
priority: 20
|
priority: 20,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -110,19 +110,19 @@ export async function batchScrapeController(
|
|||||||
await lockURLs(
|
await lockURLs(
|
||||||
id,
|
id,
|
||||||
sc,
|
sc,
|
||||||
jobs.map((x) => x.data.url)
|
jobs.map((x) => x.data.url),
|
||||||
);
|
);
|
||||||
logger.debug("Adding scrape jobs to Redis...");
|
logger.debug("Adding scrape jobs to Redis...");
|
||||||
await addCrawlJobs(
|
await addCrawlJobs(
|
||||||
id,
|
id,
|
||||||
jobs.map((x) => x.opts.jobId)
|
jobs.map((x) => x.opts.jobId),
|
||||||
);
|
);
|
||||||
logger.debug("Adding scrape jobs to BullMQ...");
|
logger.debug("Adding scrape jobs to BullMQ...");
|
||||||
await addScrapeJobs(jobs);
|
await addScrapeJobs(jobs);
|
||||||
|
|
||||||
if (req.body.webhook) {
|
if (req.body.webhook) {
|
||||||
logger.debug("Calling webhook with batch_scrape.started...", {
|
logger.debug("Calling webhook with batch_scrape.started...", {
|
||||||
webhook: req.body.webhook
|
webhook: req.body.webhook,
|
||||||
});
|
});
|
||||||
await callWebhook(
|
await callWebhook(
|
||||||
req.auth.team_id,
|
req.auth.team_id,
|
||||||
@@ -130,7 +130,7 @@ export async function batchScrapeController(
|
|||||||
null,
|
null,
|
||||||
req.body.webhook,
|
req.body.webhook,
|
||||||
true,
|
true,
|
||||||
"batch_scrape.started"
|
"batch_scrape.started",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,6 +139,6 @@ export async function batchScrapeController(
|
|||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
id,
|
id,
|
||||||
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`
|
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { authenticateUser } from "../auth";
|
|||||||
import {
|
import {
|
||||||
ConcurrencyCheckParams,
|
ConcurrencyCheckParams,
|
||||||
ConcurrencyCheckResponse,
|
ConcurrencyCheckResponse,
|
||||||
RequestWithAuth
|
RequestWithAuth,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { RateLimiterMode } from "../../types";
|
import { RateLimiterMode } from "../../types";
|
||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
@@ -10,14 +10,14 @@ import { redisConnection } from "../../services/queue-service";
|
|||||||
// Basically just middleware and error wrapping
|
// Basically just middleware and error wrapping
|
||||||
export async function concurrencyCheckController(
|
export async function concurrencyCheckController(
|
||||||
req: RequestWithAuth<ConcurrencyCheckParams, undefined, undefined>,
|
req: RequestWithAuth<ConcurrencyCheckParams, undefined, undefined>,
|
||||||
res: Response<ConcurrencyCheckResponse>
|
res: Response<ConcurrencyCheckResponse>,
|
||||||
) {
|
) {
|
||||||
const concurrencyLimiterKey = "concurrency-limiter:" + req.auth.team_id;
|
const concurrencyLimiterKey = "concurrency-limiter:" + req.auth.team_id;
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
const activeJobsOfTeam = await redisConnection.zrangebyscore(
|
const activeJobsOfTeam = await redisConnection.zrangebyscore(
|
||||||
concurrencyLimiterKey,
|
concurrencyLimiterKey,
|
||||||
now,
|
now,
|
||||||
Infinity
|
Infinity,
|
||||||
);
|
);
|
||||||
return res
|
return res
|
||||||
.status(200)
|
.status(200)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ configDotenv();
|
|||||||
|
|
||||||
export async function crawlCancelController(
|
export async function crawlCancelController(
|
||||||
req: RequestWithAuth<{ jobId: string }>,
|
req: RequestWithAuth<{ jobId: string }>,
|
||||||
res: Response
|
res: Response,
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
@@ -43,7 +43,7 @@ export async function crawlCancelController(
|
|||||||
}
|
}
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
status: "cancelled"
|
status: "cancelled",
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import {
|
|||||||
CrawlStatusResponse,
|
CrawlStatusResponse,
|
||||||
Document,
|
Document,
|
||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
RequestWithAuth
|
RequestWithAuth,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { WebSocket } from "ws";
|
import { WebSocket } from "ws";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
@@ -19,7 +19,7 @@ import {
|
|||||||
getDoneJobsOrderedLength,
|
getDoneJobsOrderedLength,
|
||||||
getThrottledJobs,
|
getThrottledJobs,
|
||||||
isCrawlFinished,
|
isCrawlFinished,
|
||||||
isCrawlFinishedLocked
|
isCrawlFinishedLocked,
|
||||||
} from "../../lib/crawl-redis";
|
} from "../../lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { getJob, getJobs } from "./crawl-status";
|
import { getJob, getJobs } from "./crawl-status";
|
||||||
@@ -64,7 +64,7 @@ function close(ws: WebSocket, code: number, msg: Message) {
|
|||||||
|
|
||||||
async function crawlStatusWS(
|
async function crawlStatusWS(
|
||||||
ws: WebSocket,
|
ws: WebSocket,
|
||||||
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>
|
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>,
|
||||||
) {
|
) {
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
@@ -89,7 +89,10 @@ async function crawlStatusWS(
|
|||||||
|
|
||||||
const notDoneJobIDs = jobIDs.filter((x) => !doneJobIDs.includes(x));
|
const notDoneJobIDs = jobIDs.filter((x) => !doneJobIDs.includes(x));
|
||||||
const jobStatuses = await Promise.all(
|
const jobStatuses = await Promise.all(
|
||||||
notDoneJobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)])
|
notDoneJobIDs.map(async (x) => [
|
||||||
|
x,
|
||||||
|
await getScrapeQueue().getJobState(x),
|
||||||
|
]),
|
||||||
);
|
);
|
||||||
const newlyDoneJobIDs: string[] = jobStatuses
|
const newlyDoneJobIDs: string[] = jobStatuses
|
||||||
.filter((x) => x[1] === "completed" || x[1] === "failed")
|
.filter((x) => x[1] === "completed" || x[1] === "failed")
|
||||||
@@ -102,7 +105,7 @@ async function crawlStatusWS(
|
|||||||
if (job.returnvalue) {
|
if (job.returnvalue) {
|
||||||
send(ws, {
|
send(ws, {
|
||||||
type: "document",
|
type: "document",
|
||||||
data: job.returnvalue
|
data: job.returnvalue,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||||
@@ -120,7 +123,9 @@ async function crawlStatusWS(
|
|||||||
|
|
||||||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||||
let jobStatuses = await Promise.all(
|
let jobStatuses = await Promise.all(
|
||||||
jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const)
|
jobIDs.map(
|
||||||
|
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
|
||||||
|
),
|
||||||
);
|
);
|
||||||
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
||||||
|
|
||||||
@@ -161,8 +166,8 @@ async function crawlStatusWS(
|
|||||||
completed: doneJobIDs.length,
|
completed: doneJobIDs.length,
|
||||||
creditsUsed: jobIDs.length,
|
creditsUsed: jobIDs.length,
|
||||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||||
data: data
|
data: data,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
if (status !== "scraping") {
|
if (status !== "scraping") {
|
||||||
@@ -174,7 +179,7 @@ async function crawlStatusWS(
|
|||||||
// Basically just middleware and error wrapping
|
// Basically just middleware and error wrapping
|
||||||
export async function crawlStatusWSController(
|
export async function crawlStatusWSController(
|
||||||
ws: WebSocket,
|
ws: WebSocket,
|
||||||
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>
|
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>,
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus);
|
const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus);
|
||||||
@@ -182,7 +187,7 @@ export async function crawlStatusWSController(
|
|||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return close(ws, 3000, {
|
return close(ws, 3000, {
|
||||||
type: "error",
|
type: "error",
|
||||||
error: auth.error
|
error: auth.error,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -201,7 +206,7 @@ export async function crawlStatusWSController(
|
|||||||
verbose = JSON.stringify({
|
verbose = JSON.stringify({
|
||||||
message: err.message,
|
message: err.message,
|
||||||
name: err.name,
|
name: err.name,
|
||||||
stack: err.stack
|
stack: err.stack,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -212,13 +217,13 @@ export async function crawlStatusWSController(
|
|||||||
") -- ID " +
|
") -- ID " +
|
||||||
id +
|
id +
|
||||||
" -- " +
|
" -- " +
|
||||||
verbose
|
verbose,
|
||||||
);
|
);
|
||||||
return close(ws, 1011, {
|
return close(ws, 1011, {
|
||||||
type: "error",
|
type: "error",
|
||||||
error:
|
error:
|
||||||
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
|
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
|
||||||
id
|
id,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import {
|
|||||||
CrawlStatusParams,
|
CrawlStatusParams,
|
||||||
CrawlStatusResponse,
|
CrawlStatusResponse,
|
||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
RequestWithAuth
|
RequestWithAuth,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
getCrawl,
|
getCrawl,
|
||||||
@@ -11,12 +11,12 @@ import {
|
|||||||
getCrawlJobs,
|
getCrawlJobs,
|
||||||
getDoneJobsOrdered,
|
getDoneJobsOrdered,
|
||||||
getDoneJobsOrderedLength,
|
getDoneJobsOrderedLength,
|
||||||
getThrottledJobs
|
getThrottledJobs,
|
||||||
} from "../../lib/crawl-redis";
|
} from "../../lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import {
|
import {
|
||||||
supabaseGetJobById,
|
supabaseGetJobById,
|
||||||
supabaseGetJobsById
|
supabaseGetJobsById,
|
||||||
} from "../../lib/supabase-jobs";
|
} from "../../lib/supabase-jobs";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import { Job, JobState } from "bullmq";
|
import { Job, JobState } from "bullmq";
|
||||||
@@ -70,7 +70,7 @@ export async function getJobs(ids: string[]) {
|
|||||||
export async function crawlStatusController(
|
export async function crawlStatusController(
|
||||||
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>,
|
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>,
|
||||||
res: Response<CrawlStatusResponse>,
|
res: Response<CrawlStatusResponse>,
|
||||||
isBatch = false
|
isBatch = false,
|
||||||
) {
|
) {
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
@@ -90,7 +90,9 @@ export async function crawlStatusController(
|
|||||||
|
|
||||||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||||
let jobStatuses = await Promise.all(
|
let jobStatuses = await Promise.all(
|
||||||
jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const)
|
jobIDs.map(
|
||||||
|
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
|
||||||
|
),
|
||||||
);
|
);
|
||||||
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
||||||
|
|
||||||
@@ -124,7 +126,7 @@ export async function crawlStatusController(
|
|||||||
const doneJobsOrder = await getDoneJobsOrdered(
|
const doneJobsOrder = await getDoneJobsOrdered(
|
||||||
req.params.jobId,
|
req.params.jobId,
|
||||||
start,
|
start,
|
||||||
end ?? -1
|
end ?? -1,
|
||||||
);
|
);
|
||||||
|
|
||||||
let doneJobs: Job[] = [];
|
let doneJobs: Job[] = [];
|
||||||
@@ -158,7 +160,7 @@ export async function crawlStatusController(
|
|||||||
if (job.returnvalue === undefined) {
|
if (job.returnvalue === undefined) {
|
||||||
logger.warn(
|
logger.warn(
|
||||||
"Job was considered done, but returnvalue is undefined!",
|
"Job was considered done, but returnvalue is undefined!",
|
||||||
{ jobId: job.id, state }
|
{ jobId: job.id, state },
|
||||||
);
|
);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -175,8 +177,8 @@ export async function crawlStatusController(
|
|||||||
doneJobs = (
|
doneJobs = (
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
(await getJobs(doneJobsOrder)).map(async (x) =>
|
(await getJobs(doneJobsOrder)).map(async (x) =>
|
||||||
(await x.getState()) === "failed" ? null : x
|
(await x.getState()) === "failed" ? null : x,
|
||||||
)
|
),
|
||||||
)
|
)
|
||||||
).filter((x) => x !== null) as Job[];
|
).filter((x) => x !== null) as Job[];
|
||||||
}
|
}
|
||||||
@@ -185,7 +187,7 @@ export async function crawlStatusController(
|
|||||||
|
|
||||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||||
const nextURL = new URL(
|
const nextURL = new URL(
|
||||||
`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`
|
`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
nextURL.searchParams.set("skip", (start + data.length).toString());
|
nextURL.searchParams.set("skip", (start + data.length).toString());
|
||||||
@@ -215,6 +217,6 @@ export async function crawlStatusController(
|
|||||||
status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this
|
status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this
|
||||||
? undefined
|
? undefined
|
||||||
: nextURL.href,
|
: nextURL.href,
|
||||||
data: data
|
data: data,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import {
|
|||||||
crawlRequestSchema,
|
crawlRequestSchema,
|
||||||
CrawlResponse,
|
CrawlResponse,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
toLegacyCrawlerOptions
|
toLegacyCrawlerOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
addCrawlJob,
|
addCrawlJob,
|
||||||
@@ -14,7 +14,7 @@ import {
|
|||||||
lockURL,
|
lockURL,
|
||||||
lockURLs,
|
lockURLs,
|
||||||
saveCrawl,
|
saveCrawl,
|
||||||
StoredCrawl
|
StoredCrawl,
|
||||||
} from "../../lib/crawl-redis";
|
} from "../../lib/crawl-redis";
|
||||||
import { logCrawl } from "../../services/logging/crawl_log";
|
import { logCrawl } from "../../services/logging/crawl_log";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
@@ -26,7 +26,7 @@ import { scrapeOptions as scrapeOptionsSchema } from "./types";
|
|||||||
|
|
||||||
export async function crawlController(
|
export async function crawlController(
|
||||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||||
res: Response<CrawlResponse>
|
res: Response<CrawlResponse>,
|
||||||
) {
|
) {
|
||||||
const preNormalizedBody = req.body;
|
const preNormalizedBody = req.body;
|
||||||
req.body = crawlRequestSchema.parse(req.body);
|
req.body = crawlRequestSchema.parse(req.body);
|
||||||
@@ -37,12 +37,12 @@ export async function crawlController(
|
|||||||
module: "api/v1",
|
module: "api/v1",
|
||||||
method: "crawlController",
|
method: "crawlController",
|
||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
plan: req.auth.plan
|
plan: req.auth.plan,
|
||||||
});
|
});
|
||||||
logger.debug("Crawl " + id + " starting", {
|
logger.debug("Crawl " + id + " starting", {
|
||||||
request: req.body,
|
request: req.body,
|
||||||
originalRequest: preNormalizedBody,
|
originalRequest: preNormalizedBody,
|
||||||
account: req.account
|
account: req.account,
|
||||||
});
|
});
|
||||||
|
|
||||||
await logCrawl(id, req.auth.team_id);
|
await logCrawl(id, req.auth.team_id);
|
||||||
@@ -56,7 +56,7 @@ export async function crawlController(
|
|||||||
const crawlerOptions = {
|
const crawlerOptions = {
|
||||||
...req.body,
|
...req.body,
|
||||||
url: undefined,
|
url: undefined,
|
||||||
scrapeOptions: undefined
|
scrapeOptions: undefined,
|
||||||
};
|
};
|
||||||
const scrapeOptions = req.body.scrapeOptions;
|
const scrapeOptions = req.body.scrapeOptions;
|
||||||
|
|
||||||
@@ -86,7 +86,7 @@ export async function crawlController(
|
|||||||
logger.debug("Determined limit: " + crawlerOptions.limit, {
|
logger.debug("Determined limit: " + crawlerOptions.limit, {
|
||||||
remainingCredits,
|
remainingCredits,
|
||||||
bodyLimit: originalLimit,
|
bodyLimit: originalLimit,
|
||||||
originalBodyLimit: preNormalizedBody.limit
|
originalBodyLimit: preNormalizedBody.limit,
|
||||||
});
|
});
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
@@ -96,7 +96,7 @@ export async function crawlController(
|
|||||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan
|
plan: req.auth.plan,
|
||||||
};
|
};
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
@@ -105,7 +105,7 @@ export async function crawlController(
|
|||||||
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.debug("Failed to get robots.txt (this is probably fine!)", {
|
logger.debug("Failed to get robots.txt (this is probably fine!)", {
|
||||||
error: e
|
error: e,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,7 +117,7 @@ export async function crawlController(
|
|||||||
|
|
||||||
if (sitemap !== null && sitemap.length > 0) {
|
if (sitemap !== null && sitemap.length > 0) {
|
||||||
logger.debug("Using sitemap of length " + sitemap.length, {
|
logger.debug("Using sitemap of length " + sitemap.length, {
|
||||||
sitemapLength: sitemap.length
|
sitemapLength: sitemap.length,
|
||||||
});
|
});
|
||||||
let jobPriority = 20;
|
let jobPriority = 20;
|
||||||
// If it is over 1000, we need to get the job priority,
|
// If it is over 1000, we need to get the job priority,
|
||||||
@@ -127,7 +127,7 @@ export async function crawlController(
|
|||||||
jobPriority = await getJobPriority({
|
jobPriority = await getJobPriority({
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
basePriority: 21
|
basePriority: 21,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||||
@@ -149,12 +149,12 @@ export async function crawlController(
|
|||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook,
|
||||||
v1: true
|
v1: true,
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
priority: 20
|
priority: 20,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -162,18 +162,18 @@ export async function crawlController(
|
|||||||
await lockURLs(
|
await lockURLs(
|
||||||
id,
|
id,
|
||||||
sc,
|
sc,
|
||||||
jobs.map((x) => x.data.url)
|
jobs.map((x) => x.data.url),
|
||||||
);
|
);
|
||||||
logger.debug("Adding scrape jobs to Redis...");
|
logger.debug("Adding scrape jobs to Redis...");
|
||||||
await addCrawlJobs(
|
await addCrawlJobs(
|
||||||
id,
|
id,
|
||||||
jobs.map((x) => x.opts.jobId)
|
jobs.map((x) => x.opts.jobId),
|
||||||
);
|
);
|
||||||
logger.debug("Adding scrape jobs to BullMQ...");
|
logger.debug("Adding scrape jobs to BullMQ...");
|
||||||
await getScrapeQueue().addBulk(jobs);
|
await getScrapeQueue().addBulk(jobs);
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Sitemap not found or ignored.", {
|
logger.debug("Sitemap not found or ignored.", {
|
||||||
ignoreSitemap: sc.crawlerOptions.ignoreSitemap
|
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
|
||||||
});
|
});
|
||||||
|
|
||||||
logger.debug("Locking URL...");
|
logger.debug("Locking URL...");
|
||||||
@@ -192,12 +192,12 @@ export async function crawlController(
|
|||||||
origin: "api",
|
origin: "api",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook,
|
||||||
v1: true
|
v1: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
priority: 15
|
priority: 15,
|
||||||
},
|
},
|
||||||
jobId
|
jobId,
|
||||||
);
|
);
|
||||||
logger.debug("Adding scrape job to BullMQ...", { jobId });
|
logger.debug("Adding scrape job to BullMQ...", { jobId });
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
@@ -206,7 +206,7 @@ export async function crawlController(
|
|||||||
|
|
||||||
if (req.body.webhook) {
|
if (req.body.webhook) {
|
||||||
logger.debug("Calling webhook with crawl.started...", {
|
logger.debug("Calling webhook with crawl.started...", {
|
||||||
webhook: req.body.webhook
|
webhook: req.body.webhook,
|
||||||
});
|
});
|
||||||
await callWebhook(
|
await callWebhook(
|
||||||
req.auth.team_id,
|
req.auth.team_id,
|
||||||
@@ -214,7 +214,7 @@ export async function crawlController(
|
|||||||
null,
|
null,
|
||||||
req.body.webhook,
|
req.body.webhook,
|
||||||
true,
|
true,
|
||||||
"crawl.started"
|
"crawl.started",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,6 +223,6 @@ export async function crawlController(
|
|||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
id,
|
id,
|
||||||
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
|
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import {
|
|||||||
extractRequestSchema,
|
extractRequestSchema,
|
||||||
ExtractResponse,
|
ExtractResponse,
|
||||||
MapDocument,
|
MapDocument,
|
||||||
scrapeOptions
|
scrapeOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document } from "../../lib/entities";
|
||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
@@ -43,7 +43,7 @@ const MIN_REQUIRED_LINKS = 1;
|
|||||||
*/
|
*/
|
||||||
export async function extractController(
|
export async function extractController(
|
||||||
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
|
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
|
||||||
res: Response<ExtractResponse>
|
res: Response<ExtractResponse>,
|
||||||
) {
|
) {
|
||||||
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
|
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
|
||||||
|
|
||||||
@@ -81,7 +81,7 @@ export async function extractController(
|
|||||||
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
|
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
|
||||||
ignoreSitemap: !selfHosted ? true : false,
|
ignoreSitemap: !selfHosted ? true : false,
|
||||||
includeMetadata: true,
|
includeMetadata: true,
|
||||||
includeSubdomains: req.body.includeSubdomains
|
includeSubdomains: req.body.includeSubdomains,
|
||||||
});
|
});
|
||||||
|
|
||||||
let mappedLinks = mapResults.links as MapDocument[];
|
let mappedLinks = mapResults.links as MapDocument[];
|
||||||
@@ -89,7 +89,8 @@ export async function extractController(
|
|||||||
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
||||||
|
|
||||||
let mappedLinksRerank = mappedLinks.map(
|
let mappedLinksRerank = mappedLinks.map(
|
||||||
(x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`
|
(x) =>
|
||||||
|
`url: ${x.url}, title: ${x.title}, description: ${x.description}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Filter by path prefix if present
|
// Filter by path prefix if present
|
||||||
@@ -103,31 +104,31 @@ export async function extractController(
|
|||||||
const linksAndScores = await performRanking(
|
const linksAndScores = await performRanking(
|
||||||
mappedLinksRerank,
|
mappedLinksRerank,
|
||||||
mappedLinks.map((l) => l.url),
|
mappedLinks.map((l) => l.url),
|
||||||
mapUrl
|
mapUrl,
|
||||||
);
|
);
|
||||||
|
|
||||||
// First try with high threshold
|
// First try with high threshold
|
||||||
let filteredLinks = filterAndProcessLinks(
|
let filteredLinks = filterAndProcessLinks(
|
||||||
mappedLinks,
|
mappedLinks,
|
||||||
linksAndScores,
|
linksAndScores,
|
||||||
INITIAL_SCORE_THRESHOLD
|
INITIAL_SCORE_THRESHOLD,
|
||||||
);
|
);
|
||||||
|
|
||||||
// If we don't have enough high-quality links, try with lower threshold
|
// If we don't have enough high-quality links, try with lower threshold
|
||||||
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
|
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
|
||||||
logger.info(
|
logger.info(
|
||||||
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`
|
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
|
||||||
);
|
);
|
||||||
filteredLinks = filterAndProcessLinks(
|
filteredLinks = filterAndProcessLinks(
|
||||||
mappedLinks,
|
mappedLinks,
|
||||||
linksAndScores,
|
linksAndScores,
|
||||||
FALLBACK_SCORE_THRESHOLD
|
FALLBACK_SCORE_THRESHOLD,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (filteredLinks.length === 0) {
|
if (filteredLinks.length === 0) {
|
||||||
// If still no results, take top N results regardless of score
|
// If still no results, take top N results regardless of score
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`
|
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
|
||||||
);
|
);
|
||||||
filteredLinks = linksAndScores
|
filteredLinks = linksAndScores
|
||||||
.sort((a, b) => b.score - a.score)
|
.sort((a, b) => b.score - a.score)
|
||||||
@@ -135,7 +136,9 @@ export async function extractController(
|
|||||||
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||||
.filter(
|
.filter(
|
||||||
(x): x is MapDocument =>
|
(x): x is MapDocument =>
|
||||||
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)
|
x !== undefined &&
|
||||||
|
x.url !== undefined &&
|
||||||
|
!isUrlBlocked(x.url),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -161,7 +164,7 @@ export async function extractController(
|
|||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
success: false,
|
success: false,
|
||||||
error:
|
error:
|
||||||
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
|
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +177,7 @@ export async function extractController(
|
|||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
plan: req.auth.plan as PlanType,
|
plan: req.auth.plan as PlanType,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
basePriority: 10
|
basePriority: 10,
|
||||||
});
|
});
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
@@ -186,11 +189,11 @@ export async function extractController(
|
|||||||
internalOptions: {},
|
internalOptions: {},
|
||||||
plan: req.auth.plan!,
|
plan: req.auth.plan!,
|
||||||
origin,
|
origin,
|
||||||
is_scrape: true
|
is_scrape: true,
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId,
|
jobId,
|
||||||
jobPriority
|
jobPriority,
|
||||||
);
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -208,12 +211,12 @@ export async function extractController(
|
|||||||
) {
|
) {
|
||||||
throw {
|
throw {
|
||||||
status: 408,
|
status: 408,
|
||||||
error: "Request timed out"
|
error: "Request timed out",
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
throw {
|
throw {
|
||||||
status: 500,
|
status: 500,
|
||||||
error: `(Internal server error) - ${e && e.message ? e.message : e}`
|
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -225,7 +228,7 @@ export async function extractController(
|
|||||||
} catch (e) {
|
} catch (e) {
|
||||||
return res.status(e.status).json({
|
return res.status(e.status).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: e.error
|
error: e.error,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -237,11 +240,11 @@ export async function extractController(
|
|||||||
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " +
|
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " +
|
||||||
links.join(", "),
|
links.join(", "),
|
||||||
prompt: req.body.prompt,
|
prompt: req.body.prompt,
|
||||||
schema: req.body.schema
|
schema: req.body.schema,
|
||||||
},
|
},
|
||||||
docs.map((x) => buildDocument(x)).join("\n"),
|
docs.map((x) => buildDocument(x)).join("\n"),
|
||||||
undefined,
|
undefined,
|
||||||
true // isExtractEndpoint
|
true, // isExtractEndpoint
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO: change this later
|
// TODO: change this later
|
||||||
@@ -249,9 +252,9 @@ export async function extractController(
|
|||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
|
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
|
||||||
(error) => {
|
(error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`
|
`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`,
|
||||||
);
|
);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
let data = completions.extract ?? {};
|
let data = completions.extract ?? {};
|
||||||
@@ -269,14 +272,14 @@ export async function extractController(
|
|||||||
url: req.body.urls.join(", "),
|
url: req.body.urls.join(", "),
|
||||||
scrapeOptions: req.body,
|
scrapeOptions: req.body,
|
||||||
origin: req.body.origin ?? "api",
|
origin: req.body.origin ?? "api",
|
||||||
num_tokens: completions.numTokens ?? 0
|
num_tokens: completions.numTokens ?? 0,
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: data,
|
data: data,
|
||||||
scrape_id: id,
|
scrape_id: id,
|
||||||
warning: warning
|
warning: warning,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -295,13 +298,13 @@ function filterAndProcessLinks(
|
|||||||
score: number;
|
score: number;
|
||||||
originalIndex: number;
|
originalIndex: number;
|
||||||
}[],
|
}[],
|
||||||
threshold: number
|
threshold: number,
|
||||||
): MapDocument[] {
|
): MapDocument[] {
|
||||||
return linksAndScores
|
return linksAndScores
|
||||||
.filter((x) => x.score > threshold)
|
.filter((x) => x.score > threshold)
|
||||||
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||||
.filter(
|
.filter(
|
||||||
(x): x is MapDocument =>
|
(x): x is MapDocument =>
|
||||||
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)
|
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import {
|
|||||||
MapDocument,
|
MapDocument,
|
||||||
mapRequestSchema,
|
mapRequestSchema,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
scrapeOptions
|
scrapeOptions,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||||
import { MapResponse, MapRequest } from "./types";
|
import { MapResponse, MapRequest } from "./types";
|
||||||
@@ -13,7 +13,7 @@ import {
|
|||||||
checkAndUpdateURLForMap,
|
checkAndUpdateURLForMap,
|
||||||
isSameDomain,
|
isSameDomain,
|
||||||
isSameSubdomain,
|
isSameSubdomain,
|
||||||
removeDuplicateUrls
|
removeDuplicateUrls,
|
||||||
} from "../../lib/validateUrl";
|
} from "../../lib/validateUrl";
|
||||||
import { fireEngineMap } from "../../search/fireEngine";
|
import { fireEngineMap } from "../../search/fireEngine";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
@@ -49,7 +49,7 @@ export async function getMapResults({
|
|||||||
plan,
|
plan,
|
||||||
origin,
|
origin,
|
||||||
includeMetadata = false,
|
includeMetadata = false,
|
||||||
allowExternalLinks
|
allowExternalLinks,
|
||||||
}: {
|
}: {
|
||||||
url: string;
|
url: string;
|
||||||
search?: string;
|
search?: string;
|
||||||
@@ -72,13 +72,13 @@ export async function getMapResults({
|
|||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
|
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
|
||||||
scrapeOptions: undefined
|
scrapeOptions: undefined,
|
||||||
},
|
},
|
||||||
scrapeOptions: scrapeOptions.parse({}),
|
scrapeOptions: scrapeOptions.parse({}),
|
||||||
internalOptions: {},
|
internalOptions: {},
|
||||||
team_id: teamId,
|
team_id: teamId,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: plan
|
plan: plan,
|
||||||
};
|
};
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
@@ -114,7 +114,7 @@ export async function getMapResults({
|
|||||||
|
|
||||||
const resultsPerPage = 100;
|
const resultsPerPage = 100;
|
||||||
const maxPages = Math.ceil(
|
const maxPages = Math.ceil(
|
||||||
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
|
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage,
|
||||||
);
|
);
|
||||||
|
|
||||||
const cacheKey = `fireEngineMap:${mapUrl}`;
|
const cacheKey = `fireEngineMap:${mapUrl}`;
|
||||||
@@ -129,12 +129,12 @@ export async function getMapResults({
|
|||||||
const fetchPage = async (page: number) => {
|
const fetchPage = async (page: number) => {
|
||||||
return fireEngineMap(mapUrl, {
|
return fireEngineMap(mapUrl, {
|
||||||
numResults: resultsPerPage,
|
numResults: resultsPerPage,
|
||||||
page: page
|
page: page,
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
pagePromises = Array.from({ length: maxPages }, (_, i) =>
|
pagePromises = Array.from({ length: maxPages }, (_, i) =>
|
||||||
fetchPage(i + 1)
|
fetchPage(i + 1),
|
||||||
);
|
);
|
||||||
allResults = await Promise.all(pagePromises);
|
allResults = await Promise.all(pagePromises);
|
||||||
|
|
||||||
@@ -144,7 +144,7 @@ export async function getMapResults({
|
|||||||
// Parallelize sitemap fetch with serper search
|
// Parallelize sitemap fetch with serper search
|
||||||
const [sitemap, ...searchResults] = await Promise.all([
|
const [sitemap, ...searchResults] = await Promise.all([
|
||||||
ignoreSitemap ? null : crawler.tryGetSitemap(true),
|
ignoreSitemap ? null : crawler.tryGetSitemap(true),
|
||||||
...(cachedResult ? [] : pagePromises)
|
...(cachedResult ? [] : pagePromises),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (!cachedResult) {
|
if (!cachedResult) {
|
||||||
@@ -172,7 +172,7 @@ export async function getMapResults({
|
|||||||
links = [
|
links = [
|
||||||
mapResults[0].url,
|
mapResults[0].url,
|
||||||
...mapResults.slice(1).map((x) => x.url),
|
...mapResults.slice(1).map((x) => x.url),
|
||||||
...links
|
...links,
|
||||||
];
|
];
|
||||||
} else {
|
} else {
|
||||||
mapResults.map((x) => {
|
mapResults.map((x) => {
|
||||||
@@ -218,13 +218,13 @@ export async function getMapResults({
|
|||||||
links: includeMetadata ? mapResults : linksToReturn,
|
links: includeMetadata ? mapResults : linksToReturn,
|
||||||
scrape_id: origin?.includes("website") ? id : undefined,
|
scrape_id: origin?.includes("website") ? id : undefined,
|
||||||
job_id: id,
|
job_id: id,
|
||||||
time_taken: (new Date().getTime() - Date.now()) / 1000
|
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function mapController(
|
export async function mapController(
|
||||||
req: RequestWithAuth<{}, MapResponse, MapRequest>,
|
req: RequestWithAuth<{}, MapResponse, MapRequest>,
|
||||||
res: Response<MapResponse>
|
res: Response<MapResponse>,
|
||||||
) {
|
) {
|
||||||
req.body = mapRequestSchema.parse(req.body);
|
req.body = mapRequestSchema.parse(req.body);
|
||||||
|
|
||||||
@@ -237,13 +237,13 @@ export async function mapController(
|
|||||||
crawlerOptions: req.body,
|
crawlerOptions: req.body,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
plan: req.auth.plan
|
plan: req.auth.plan,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Bill the team
|
// Bill the team
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
|
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -261,13 +261,13 @@ export async function mapController(
|
|||||||
crawlerOptions: {},
|
crawlerOptions: {},
|
||||||
scrapeOptions: {},
|
scrapeOptions: {},
|
||||||
origin: req.body.origin ?? "api",
|
origin: req.body.origin ?? "api",
|
||||||
num_tokens: 0
|
num_tokens: 0,
|
||||||
});
|
});
|
||||||
|
|
||||||
const response = {
|
const response = {
|
||||||
success: true as const,
|
success: true as const,
|
||||||
links: result.links,
|
links: result.links,
|
||||||
scrape_id: result.scrape_id
|
scrape_id: result.scrape_id,
|
||||||
};
|
};
|
||||||
|
|
||||||
return res.status(200).json(response);
|
return res.status(200).json(response);
|
||||||
|
|||||||
@@ -13,29 +13,29 @@ export async function scrapeStatusController(req: any, res: any) {
|
|||||||
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
|
||||||
const allowedTeams = [
|
const allowedTeams = [
|
||||||
"41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
|
"41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
|
||||||
"511544f2-2fce-4183-9c59-6c29b02c69b5"
|
"511544f2-2fce-4183-9c59-6c29b02c69b5",
|
||||||
];
|
];
|
||||||
|
|
||||||
if (!allowedTeams.includes(job?.team_id)) {
|
if (!allowedTeams.includes(job?.team_id)) {
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "You are not allowed to access this resource."
|
error: "You are not allowed to access this resource.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: job?.docs[0]
|
data: job?.docs[0],
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && error.message == "Too Many Requests") {
|
if (error instanceof Error && error.message == "Too Many Requests") {
|
||||||
return res.status(429).json({
|
return res.status(429).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "Rate limit exceeded. Please try again later."
|
error: "Rate limit exceeded. Please try again later.",
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "An unexpected error occurred."
|
error: "An unexpected error occurred.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import {
|
|||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
ScrapeRequest,
|
ScrapeRequest,
|
||||||
scrapeRequestSchema,
|
scrapeRequestSchema,
|
||||||
ScrapeResponse
|
ScrapeResponse,
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
@@ -17,7 +17,7 @@ import { getScrapeQueue } from "../../services/queue-service";
|
|||||||
|
|
||||||
export async function scrapeController(
|
export async function scrapeController(
|
||||||
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
|
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
|
||||||
res: Response<ScrapeResponse>
|
res: Response<ScrapeResponse>,
|
||||||
) {
|
) {
|
||||||
req.body = scrapeRequestSchema.parse(req.body);
|
req.body = scrapeRequestSchema.parse(req.body);
|
||||||
let earlyReturn = false;
|
let earlyReturn = false;
|
||||||
@@ -30,7 +30,7 @@ export async function scrapeController(
|
|||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
plan: req.auth.plan as PlanType,
|
plan: req.auth.plan as PlanType,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
basePriority: 10
|
basePriority: 10,
|
||||||
});
|
});
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
@@ -42,18 +42,18 @@ export async function scrapeController(
|
|||||||
internalOptions: {},
|
internalOptions: {},
|
||||||
plan: req.auth.plan!,
|
plan: req.auth.plan!,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
is_scrape: true
|
is_scrape: true,
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId,
|
jobId,
|
||||||
jobPriority
|
jobPriority,
|
||||||
);
|
);
|
||||||
|
|
||||||
const totalWait =
|
const totalWait =
|
||||||
(req.body.waitFor ?? 0) +
|
(req.body.waitFor ?? 0) +
|
||||||
(req.body.actions ?? []).reduce(
|
(req.body.actions ?? []).reduce(
|
||||||
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 0) : 0) + a,
|
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 0) : 0) + a,
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
|
|
||||||
let doc: Document;
|
let doc: Document;
|
||||||
@@ -67,12 +67,12 @@ export async function scrapeController(
|
|||||||
) {
|
) {
|
||||||
return res.status(408).json({
|
return res.status(408).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "Request timed out"
|
error: "Request timed out",
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: `(Internal server error) - ${e && e.message ? e.message : e}`
|
error: `(Internal server error) - ${e && e.message ? e.message : e}`,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -99,10 +99,10 @@ export async function scrapeController(
|
|||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
|
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
|
||||||
(error) => {
|
(error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`
|
`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`,
|
||||||
);
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!req.body.formats.includes("rawHtml")) {
|
if (!req.body.formats.includes("rawHtml")) {
|
||||||
@@ -123,12 +123,12 @@ export async function scrapeController(
|
|||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
scrapeOptions: req.body,
|
scrapeOptions: req.body,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
num_tokens: numTokens
|
num_tokens: numTokens,
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: doc,
|
data: doc,
|
||||||
scrape_id: origin?.includes("website") ? jobId : undefined
|
scrape_id: origin?.includes("website") ? jobId : undefined,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import {
|
|||||||
ExtractorOptions,
|
ExtractorOptions,
|
||||||
PageOptions,
|
PageOptions,
|
||||||
ScrapeActionContent,
|
ScrapeActionContent,
|
||||||
Document as V0Document
|
Document as V0Document,
|
||||||
} from "../../lib/entities";
|
} from "../../lib/entities";
|
||||||
import { InternalOptions } from "../../scraper/scrapeURL";
|
import { InternalOptions } from "../../scraper/scrapeURL";
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ export const url = z.preprocess(
|
|||||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||||
.refine(
|
.refine(
|
||||||
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
|
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
|
||||||
"URL must have a valid top-level domain or be a valid path"
|
"URL must have a valid top-level domain or be a valid path",
|
||||||
)
|
)
|
||||||
.refine((x) => {
|
.refine((x) => {
|
||||||
try {
|
try {
|
||||||
@@ -46,8 +46,8 @@ export const url = z.preprocess(
|
|||||||
}, "Invalid URL")
|
}, "Invalid URL")
|
||||||
.refine(
|
.refine(
|
||||||
(x) => !isUrlBlocked(x as string),
|
(x) => !isUrlBlocked(x as string),
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
const strictMessage =
|
const strictMessage =
|
||||||
@@ -60,9 +60,9 @@ export const extractOptions = z
|
|||||||
systemPrompt: z
|
systemPrompt: z
|
||||||
.string()
|
.string()
|
||||||
.default(
|
.default(
|
||||||
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."
|
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required.",
|
||||||
),
|
),
|
||||||
prompt: z.string().optional()
|
prompt: z.string().optional(),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ export const actionsSchema = z.array(
|
|||||||
.object({
|
.object({
|
||||||
type: z.literal("wait"),
|
type: z.literal("wait"),
|
||||||
milliseconds: z.number().int().positive().finite().optional(),
|
milliseconds: z.number().int().positive().finite().optional(),
|
||||||
selector: z.string().optional()
|
selector: z.string().optional(),
|
||||||
})
|
})
|
||||||
.refine(
|
.refine(
|
||||||
(data) =>
|
(data) =>
|
||||||
@@ -82,38 +82,38 @@ export const actionsSchema = z.array(
|
|||||||
!(data.milliseconds !== undefined && data.selector !== undefined),
|
!(data.milliseconds !== undefined && data.selector !== undefined),
|
||||||
{
|
{
|
||||||
message:
|
message:
|
||||||
"Either 'milliseconds' or 'selector' must be provided, but not both."
|
"Either 'milliseconds' or 'selector' must be provided, but not both.",
|
||||||
}
|
},
|
||||||
),
|
),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("click"),
|
type: z.literal("click"),
|
||||||
selector: z.string()
|
selector: z.string(),
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("screenshot"),
|
type: z.literal("screenshot"),
|
||||||
fullPage: z.boolean().default(false)
|
fullPage: z.boolean().default(false),
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("write"),
|
type: z.literal("write"),
|
||||||
text: z.string()
|
text: z.string(),
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("press"),
|
type: z.literal("press"),
|
||||||
key: z.string()
|
key: z.string(),
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("scroll"),
|
type: z.literal("scroll"),
|
||||||
direction: z.enum(["up", "down"]).optional().default("down"),
|
direction: z.enum(["up", "down"]).optional().default("down"),
|
||||||
selector: z.string().optional()
|
selector: z.string().optional(),
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("scrape")
|
type: z.literal("scrape"),
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("executeJavascript"),
|
type: z.literal("executeJavascript"),
|
||||||
script: z.string()
|
script: z.string(),
|
||||||
})
|
}),
|
||||||
])
|
]),
|
||||||
);
|
);
|
||||||
|
|
||||||
export const scrapeOptions = z
|
export const scrapeOptions = z
|
||||||
@@ -126,14 +126,14 @@ export const scrapeOptions = z
|
|||||||
"links",
|
"links",
|
||||||
"screenshot",
|
"screenshot",
|
||||||
"screenshot@fullPage",
|
"screenshot@fullPage",
|
||||||
"extract"
|
"extract",
|
||||||
])
|
])
|
||||||
.array()
|
.array()
|
||||||
.optional()
|
.optional()
|
||||||
.default(["markdown"])
|
.default(["markdown"])
|
||||||
.refine(
|
.refine(
|
||||||
(x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
|
(x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
|
||||||
"You may only specify either screenshot or screenshot@fullPage"
|
"You may only specify either screenshot or screenshot@fullPage",
|
||||||
),
|
),
|
||||||
headers: z.record(z.string(), z.string()).optional(),
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
includeTags: z.string().array().optional(),
|
includeTags: z.string().array().optional(),
|
||||||
@@ -155,11 +155,11 @@ export const scrapeOptions = z
|
|||||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
||||||
{
|
{
|
||||||
message:
|
message:
|
||||||
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code."
|
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
.transform((val) => (val ? val.toUpperCase() : "US")),
|
.transform((val) => (val ? val.toUpperCase() : "US")),
|
||||||
languages: z.string().array().optional()
|
languages: z.string().array().optional(),
|
||||||
})
|
})
|
||||||
.optional(),
|
.optional(),
|
||||||
|
|
||||||
@@ -173,15 +173,15 @@ export const scrapeOptions = z
|
|||||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
||||||
{
|
{
|
||||||
message:
|
message:
|
||||||
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code."
|
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
.transform((val) => (val ? val.toUpperCase() : "US")),
|
.transform((val) => (val ? val.toUpperCase() : "US")),
|
||||||
languages: z.string().array().optional()
|
languages: z.string().array().optional(),
|
||||||
})
|
})
|
||||||
.optional(),
|
.optional(),
|
||||||
skipTlsVerification: z.boolean().default(false),
|
skipTlsVerification: z.boolean().default(false),
|
||||||
removeBase64Images: z.boolean().default(true)
|
removeBase64Images: z.boolean().default(true),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@@ -199,7 +199,7 @@ export const extractV1Options = z
|
|||||||
includeSubdomains: z.boolean().default(true),
|
includeSubdomains: z.boolean().default(true),
|
||||||
allowExternalLinks: z.boolean().default(false),
|
allowExternalLinks: z.boolean().default(false),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(60000)
|
timeout: z.number().int().positive().finite().safe().default(60000),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@@ -212,7 +212,7 @@ export const scrapeRequestSchema = scrapeOptions
|
|||||||
.extend({
|
.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(30000)
|
timeout: z.number().int().positive().finite().safe().default(30000),
|
||||||
})
|
})
|
||||||
.strict(strictMessage)
|
.strict(strictMessage)
|
||||||
.refine(
|
.refine(
|
||||||
@@ -226,8 +226,8 @@ export const scrapeRequestSchema = scrapeOptions
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
message:
|
message:
|
||||||
"When 'extract' format is specified, 'extract' options must be provided, and vice versa"
|
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
.transform((obj) => {
|
.transform((obj) => {
|
||||||
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
||||||
@@ -250,9 +250,9 @@ export const webhookSchema = z.preprocess(
|
|||||||
z
|
z
|
||||||
.object({
|
.object({
|
||||||
url: z.string().url(),
|
url: z.string().url(),
|
||||||
headers: z.record(z.string(), z.string()).default({})
|
headers: z.record(z.string(), z.string()).default({}),
|
||||||
})
|
})
|
||||||
.strict(strictMessage)
|
.strict(strictMessage),
|
||||||
);
|
);
|
||||||
|
|
||||||
export const batchScrapeRequestSchema = scrapeOptions
|
export const batchScrapeRequestSchema = scrapeOptions
|
||||||
@@ -260,7 +260,7 @@ export const batchScrapeRequestSchema = scrapeOptions
|
|||||||
urls: url.array(),
|
urls: url.array(),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
webhook: webhookSchema.optional(),
|
webhook: webhookSchema.optional(),
|
||||||
appendToId: z.string().uuid().optional()
|
appendToId: z.string().uuid().optional(),
|
||||||
})
|
})
|
||||||
.strict(strictMessage)
|
.strict(strictMessage)
|
||||||
.refine(
|
.refine(
|
||||||
@@ -274,8 +274,8 @@ export const batchScrapeRequestSchema = scrapeOptions
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
message:
|
message:
|
||||||
"When 'extract' format is specified, 'extract' options must be provided, and vice versa"
|
"When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
||||||
@@ -292,7 +292,7 @@ const crawlerOptions = z
|
|||||||
ignoreRobotsTxt: z.boolean().default(false),
|
ignoreRobotsTxt: z.boolean().default(false),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
deduplicateSimilarURLs: z.boolean().default(true),
|
deduplicateSimilarURLs: z.boolean().default(true),
|
||||||
ignoreQueryParameters: z.boolean().default(false)
|
ignoreQueryParameters: z.boolean().default(false),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@@ -314,7 +314,7 @@ export const crawlRequestSchema = crawlerOptions
|
|||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
scrapeOptions: scrapeOptions.default({}),
|
scrapeOptions: scrapeOptions.default({}),
|
||||||
webhook: webhookSchema.optional(),
|
webhook: webhookSchema.optional(),
|
||||||
limit: z.number().default(10000)
|
limit: z.number().default(10000),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@@ -340,7 +340,7 @@ export const mapRequestSchema = crawlerOptions
|
|||||||
search: z.string().optional(),
|
search: z.string().optional(),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
sitemapOnly: z.boolean().default(false),
|
sitemapOnly: z.boolean().default(false),
|
||||||
limit: z.number().min(1).max(5000).default(5000)
|
limit: z.number().min(1).max(5000).default(5000),
|
||||||
})
|
})
|
||||||
.strict(strictMessage);
|
.strict(strictMessage);
|
||||||
|
|
||||||
@@ -510,7 +510,7 @@ export type AuthCreditUsageChunk = {
|
|||||||
export interface RequestWithMaybeACUC<
|
export interface RequestWithMaybeACUC<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined,
|
||||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
acuc?: AuthCreditUsageChunk;
|
acuc?: AuthCreditUsageChunk;
|
||||||
}
|
}
|
||||||
@@ -518,7 +518,7 @@ export interface RequestWithMaybeACUC<
|
|||||||
export interface RequestWithACUC<
|
export interface RequestWithACUC<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined,
|
||||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
acuc: AuthCreditUsageChunk;
|
acuc: AuthCreditUsageChunk;
|
||||||
}
|
}
|
||||||
@@ -526,7 +526,7 @@ export interface RequestWithACUC<
|
|||||||
export interface RequestWithAuth<
|
export interface RequestWithAuth<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined,
|
||||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
auth: AuthObject;
|
auth: AuthObject;
|
||||||
account?: Account;
|
account?: Account;
|
||||||
@@ -535,7 +535,7 @@ export interface RequestWithAuth<
|
|||||||
export interface RequestWithMaybeAuth<
|
export interface RequestWithMaybeAuth<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined,
|
||||||
> extends RequestWithMaybeACUC<ReqParams, ReqBody, ResBody> {
|
> extends RequestWithMaybeACUC<ReqParams, ReqBody, ResBody> {
|
||||||
auth?: AuthObject;
|
auth?: AuthObject;
|
||||||
account?: Account;
|
account?: Account;
|
||||||
@@ -544,7 +544,7 @@ export interface RequestWithMaybeAuth<
|
|||||||
export interface RequestWithAuth<
|
export interface RequestWithAuth<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined,
|
||||||
> extends RequestWithACUC<ReqParams, ReqBody, ResBody> {
|
> extends RequestWithACUC<ReqParams, ReqBody, ResBody> {
|
||||||
auth: AuthObject;
|
auth: AuthObject;
|
||||||
account?: Account;
|
account?: Account;
|
||||||
@@ -569,7 +569,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
|||||||
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
||||||
ignoreSitemap: x.ignoreSitemap,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||||
ignoreQueryParameters: x.ignoreQueryParameters
|
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -589,11 +589,11 @@ export function fromLegacyCrawlerOptions(x: any): {
|
|||||||
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
||||||
ignoreSitemap: x.ignoreSitemap,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||||
ignoreQueryParameters: x.ignoreQueryParameters
|
ignoreQueryParameters: x.ignoreQueryParameters,
|
||||||
}),
|
}),
|
||||||
internalOptions: {
|
internalOptions: {
|
||||||
v0CrawlOnlyUrls: x.returnOnlyUrls
|
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -605,7 +605,7 @@ export interface MapDocument {
|
|||||||
export function fromLegacyScrapeOptions(
|
export function fromLegacyScrapeOptions(
|
||||||
pageOptions: PageOptions,
|
pageOptions: PageOptions,
|
||||||
extractorOptions: ExtractorOptions | undefined,
|
extractorOptions: ExtractorOptions | undefined,
|
||||||
timeout: number | undefined
|
timeout: number | undefined,
|
||||||
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
||||||
return {
|
return {
|
||||||
scrapeOptions: scrapeOptions.parse({
|
scrapeOptions: scrapeOptions.parse({
|
||||||
@@ -621,7 +621,7 @@ export function fromLegacyScrapeOptions(
|
|||||||
extractorOptions.mode.includes("llm-extraction")
|
extractorOptions.mode.includes("llm-extraction")
|
||||||
? ("extract" as const)
|
? ("extract" as const)
|
||||||
: null,
|
: null,
|
||||||
"links"
|
"links",
|
||||||
].filter((x) => x !== null),
|
].filter((x) => x !== null),
|
||||||
waitFor: pageOptions.waitFor,
|
waitFor: pageOptions.waitFor,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
@@ -646,16 +646,16 @@ export function fromLegacyScrapeOptions(
|
|||||||
? {
|
? {
|
||||||
systemPrompt: extractorOptions.extractionPrompt,
|
systemPrompt: extractorOptions.extractionPrompt,
|
||||||
prompt: extractorOptions.userPrompt,
|
prompt: extractorOptions.userPrompt,
|
||||||
schema: extractorOptions.extractionSchema
|
schema: extractorOptions.extractionSchema,
|
||||||
}
|
}
|
||||||
: undefined,
|
: undefined,
|
||||||
mobile: pageOptions.mobile
|
mobile: pageOptions.mobile,
|
||||||
}),
|
}),
|
||||||
internalOptions: {
|
internalOptions: {
|
||||||
atsv: pageOptions.atsv,
|
atsv: pageOptions.atsv,
|
||||||
v0DisableJsDom: pageOptions.disableJsDom,
|
v0DisableJsDom: pageOptions.disableJsDom,
|
||||||
v0UseFastMode: pageOptions.useFastMode
|
v0UseFastMode: pageOptions.useFastMode,
|
||||||
}
|
},
|
||||||
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -664,12 +664,12 @@ export function fromLegacyCombo(
|
|||||||
pageOptions: PageOptions,
|
pageOptions: PageOptions,
|
||||||
extractorOptions: ExtractorOptions | undefined,
|
extractorOptions: ExtractorOptions | undefined,
|
||||||
timeout: number | undefined,
|
timeout: number | undefined,
|
||||||
crawlerOptions: any
|
crawlerOptions: any,
|
||||||
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
||||||
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
|
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
|
||||||
pageOptions,
|
pageOptions,
|
||||||
extractorOptions,
|
extractorOptions,
|
||||||
timeout
|
timeout,
|
||||||
);
|
);
|
||||||
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
||||||
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
||||||
@@ -677,7 +677,7 @@ export function fromLegacyCombo(
|
|||||||
|
|
||||||
export function toLegacyDocument(
|
export function toLegacyDocument(
|
||||||
document: Document,
|
document: Document,
|
||||||
internalOptions: InternalOptions
|
internalOptions: InternalOptions,
|
||||||
): V0Document | { url: string } {
|
): V0Document | { url: string } {
|
||||||
if (internalOptions.v0CrawlOnlyUrls) {
|
if (internalOptions.v0CrawlOnlyUrls) {
|
||||||
return { url: document.metadata.sourceURL! };
|
return { url: document.metadata.sourceURL! };
|
||||||
@@ -696,9 +696,9 @@ export function toLegacyDocument(
|
|||||||
statusCode: undefined,
|
statusCode: undefined,
|
||||||
pageError: document.metadata.error,
|
pageError: document.metadata.error,
|
||||||
pageStatusCode: document.metadata.statusCode,
|
pageStatusCode: document.metadata.statusCode,
|
||||||
screenshot: document.screenshot
|
screenshot: document.screenshot,
|
||||||
},
|
},
|
||||||
actions: document.actions,
|
actions: document.actions,
|
||||||
warning: document.warning
|
warning: document.warning,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
+15
-15
@@ -46,12 +46,12 @@ serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
|||||||
|
|
||||||
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||||
queues: [new BullAdapter(getScrapeQueue())],
|
queues: [new BullAdapter(getScrapeQueue())],
|
||||||
serverAdapter: serverAdapter
|
serverAdapter: serverAdapter,
|
||||||
});
|
});
|
||||||
|
|
||||||
app.use(
|
app.use(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||||
serverAdapter.getRouter()
|
serverAdapter.getRouter(),
|
||||||
);
|
);
|
||||||
|
|
||||||
app.get("/", (req, res) => {
|
app.get("/", (req, res) => {
|
||||||
@@ -75,7 +75,7 @@ function startServer(port = DEFAULT_PORT) {
|
|||||||
const server = app.listen(Number(port), HOST, () => {
|
const server = app.listen(Number(port), HOST, () => {
|
||||||
logger.info(`Worker ${process.pid} listening on port ${port}`);
|
logger.info(`Worker ${process.pid} listening on port ${port}`);
|
||||||
logger.info(
|
logger.info(
|
||||||
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
|||||||
const noWaitingJobs = waitingJobs === 0;
|
const noWaitingJobs = waitingJobs === 0;
|
||||||
// 200 if no active jobs, 503 if there are active jobs
|
// 200 if no active jobs, 503 if there are active jobs
|
||||||
return res.status(noWaitingJobs ? 200 : 500).json({
|
return res.status(noWaitingJobs ? 200 : 500).json({
|
||||||
waitingJobs
|
waitingJobs,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
@@ -120,7 +120,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||||||
const getWaitingJobsCount = async () => {
|
const getWaitingJobsCount = async () => {
|
||||||
const scrapeQueue = getScrapeQueue();
|
const scrapeQueue = getScrapeQueue();
|
||||||
const [waitingJobsCount] = await Promise.all([
|
const [waitingJobsCount] = await Promise.all([
|
||||||
scrapeQueue.getWaitingCount()
|
scrapeQueue.getWaitingCount(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
return waitingJobsCount;
|
return waitingJobsCount;
|
||||||
@@ -140,15 +140,15 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||||||
const message = {
|
const message = {
|
||||||
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
||||||
timeout / 60000
|
timeout / 60000
|
||||||
} minute(s).`
|
} minute(s).`,
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await fetch(slackWebhookUrl, {
|
const response = await fetch(slackWebhookUrl, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: JSON.stringify(message)
|
body: JSON.stringify(message),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
@@ -176,7 +176,7 @@ app.use(
|
|||||||
err: unknown,
|
err: unknown,
|
||||||
req: Request<{}, ErrorResponse, undefined>,
|
req: Request<{}, ErrorResponse, undefined>,
|
||||||
res: Response<ErrorResponse>,
|
res: Response<ErrorResponse>,
|
||||||
next: NextFunction
|
next: NextFunction,
|
||||||
) => {
|
) => {
|
||||||
if (err instanceof ZodError) {
|
if (err instanceof ZodError) {
|
||||||
if (
|
if (
|
||||||
@@ -192,7 +192,7 @@ app.use(
|
|||||||
} else {
|
} else {
|
||||||
next(err);
|
next(err);
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
Sentry.setupExpressErrorHandler(app);
|
Sentry.setupExpressErrorHandler(app);
|
||||||
@@ -202,7 +202,7 @@ app.use(
|
|||||||
err: unknown,
|
err: unknown,
|
||||||
req: Request<{}, ErrorResponse, undefined>,
|
req: Request<{}, ErrorResponse, undefined>,
|
||||||
res: ResponseWithSentry<ErrorResponse>,
|
res: ResponseWithSentry<ErrorResponse>,
|
||||||
next: NextFunction
|
next: NextFunction,
|
||||||
) => {
|
) => {
|
||||||
if (
|
if (
|
||||||
err instanceof SyntaxError &&
|
err instanceof SyntaxError &&
|
||||||
@@ -222,7 +222,7 @@ app.use(
|
|||||||
verbose = JSON.stringify({
|
verbose = JSON.stringify({
|
||||||
message: err.message,
|
message: err.message,
|
||||||
name: err.name,
|
name: err.name,
|
||||||
stack: err.stack
|
stack: err.stack,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -233,15 +233,15 @@ app.use(
|
|||||||
") -- ID " +
|
") -- ID " +
|
||||||
id +
|
id +
|
||||||
" -- " +
|
" -- " +
|
||||||
verbose
|
verbose,
|
||||||
);
|
);
|
||||||
res.status(500).json({
|
res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error:
|
error:
|
||||||
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
|
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
|
||||||
id
|
id,
|
||||||
});
|
});
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
logger.info(`Worker ${process.pid} started`);
|
logger.info(`Worker ${process.pid} started`);
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import { logger } from "../logger";
|
|||||||
export async function generateCompletions(
|
export async function generateCompletions(
|
||||||
documents: Document[],
|
documents: Document[],
|
||||||
extractionOptions: ExtractorOptions | undefined,
|
extractionOptions: ExtractorOptions | undefined,
|
||||||
mode: "markdown" | "raw-html"
|
mode: "markdown" | "raw-html",
|
||||||
): Promise<Document[]> {
|
): Promise<Document[]> {
|
||||||
// const schema = zodToJsonSchema(options.schema)
|
// const schema = zodToJsonSchema(options.schema)
|
||||||
|
|
||||||
@@ -32,7 +32,7 @@ export async function generateCompletions(
|
|||||||
schema: schema,
|
schema: schema,
|
||||||
prompt: prompt,
|
prompt: prompt,
|
||||||
systemPrompt: systemPrompt,
|
systemPrompt: systemPrompt,
|
||||||
mode: mode
|
mode: mode,
|
||||||
});
|
});
|
||||||
// Validate the JSON output against the schema using AJV
|
// Validate the JSON output against the schema using AJV
|
||||||
if (schema) {
|
if (schema) {
|
||||||
@@ -43,8 +43,8 @@ export async function generateCompletions(
|
|||||||
`JSON parsing error(s): ${validate.errors
|
`JSON parsing error(s): ${validate.errors
|
||||||
?.map((err) => err.message)
|
?.map((err) => err.message)
|
||||||
.join(
|
.join(
|
||||||
", "
|
", ",
|
||||||
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -57,7 +57,7 @@ export async function generateCompletions(
|
|||||||
default:
|
default:
|
||||||
throw new Error("Invalid client");
|
throw new Error("Invalid client");
|
||||||
}
|
}
|
||||||
})
|
}),
|
||||||
);
|
);
|
||||||
|
|
||||||
return completions;
|
return completions;
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ const defaultPrompt =
|
|||||||
|
|
||||||
function prepareOpenAIDoc(
|
function prepareOpenAIDoc(
|
||||||
document: Document,
|
document: Document,
|
||||||
mode: "markdown" | "raw-html"
|
mode: "markdown" | "raw-html",
|
||||||
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
|
): [OpenAI.Chat.Completions.ChatCompletionContentPart[], number] | null {
|
||||||
let markdown = document.markdown;
|
let markdown = document.markdown;
|
||||||
|
|
||||||
@@ -50,7 +50,7 @@ export async function generateOpenAICompletions({
|
|||||||
systemPrompt = defaultPrompt,
|
systemPrompt = defaultPrompt,
|
||||||
prompt,
|
prompt,
|
||||||
temperature,
|
temperature,
|
||||||
mode
|
mode,
|
||||||
}: {
|
}: {
|
||||||
client: OpenAI;
|
client: OpenAI;
|
||||||
model?: string;
|
model?: string;
|
||||||
@@ -68,7 +68,7 @@ export async function generateOpenAICompletions({
|
|||||||
return {
|
return {
|
||||||
...document,
|
...document,
|
||||||
warning:
|
warning:
|
||||||
"LLM extraction was not performed since the document's content is empty or missing."
|
"LLM extraction was not performed since the document's content is empty or missing.",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
const [content, numTokens] = preparedDoc;
|
const [content, numTokens] = preparedDoc;
|
||||||
@@ -81,21 +81,21 @@ export async function generateOpenAICompletions({
|
|||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: systemPrompt
|
content: systemPrompt,
|
||||||
},
|
},
|
||||||
{ role: "user", content },
|
{ role: "user", content },
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
content: `Transform the above content into structured json output based on the following user request: ${prompt}`
|
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
|
||||||
}
|
},
|
||||||
],
|
],
|
||||||
response_format: { type: "json_object" },
|
response_format: { type: "json_object" },
|
||||||
temperature
|
temperature,
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
llmExtraction = JSON.parse(
|
llmExtraction = JSON.parse(
|
||||||
(jsonCompletion.choices[0].message.content ?? "").trim()
|
(jsonCompletion.choices[0].message.content ?? "").trim(),
|
||||||
);
|
);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
throw new Error("Invalid JSON");
|
throw new Error("Invalid JSON");
|
||||||
@@ -106,9 +106,9 @@ export async function generateOpenAICompletions({
|
|||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: systemPrompt
|
content: systemPrompt,
|
||||||
},
|
},
|
||||||
{ role: "user", content }
|
{ role: "user", content },
|
||||||
],
|
],
|
||||||
tools: [
|
tools: [
|
||||||
{
|
{
|
||||||
@@ -116,12 +116,12 @@ export async function generateOpenAICompletions({
|
|||||||
function: {
|
function: {
|
||||||
name: "extract_content",
|
name: "extract_content",
|
||||||
description: "Extracts the content from the given webpage(s)",
|
description: "Extracts the content from the given webpage(s)",
|
||||||
parameters: schema
|
parameters: schema,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
],
|
],
|
||||||
tool_choice: { type: "function", function: { name: "extract_content" } },
|
tool_choice: { type: "function", function: { name: "extract_content" } },
|
||||||
temperature
|
temperature,
|
||||||
});
|
});
|
||||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||||
|
|
||||||
@@ -140,6 +140,6 @@ export async function generateOpenAICompletions({
|
|||||||
warning:
|
warning:
|
||||||
numTokens > maxTokens
|
numTokens > maxTokens
|
||||||
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
||||||
: undefined
|
: undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,16 +31,16 @@ describe("parseMarkdown", () => {
|
|||||||
{ html: "<html><p>Unclosed tag", expected: "Unclosed tag" },
|
{ html: "<html><p>Unclosed tag", expected: "Unclosed tag" },
|
||||||
{
|
{
|
||||||
html: "<div><span>Missing closing div",
|
html: "<div><span>Missing closing div",
|
||||||
expected: "Missing closing div"
|
expected: "Missing closing div",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
html: "<p><strong>Wrong nesting</em></strong></p>",
|
html: "<p><strong>Wrong nesting</em></strong></p>",
|
||||||
expected: "**Wrong nesting**"
|
expected: "**Wrong nesting**",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
html: '<a href="http://example.com">Link without closing tag',
|
html: '<a href="http://example.com">Link without closing tag',
|
||||||
expected: "[Link without closing tag](http://example.com)"
|
expected: "[Link without closing tag](http://example.com)",
|
||||||
}
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const { html, expected } of invalidHtmls) {
|
for (const { html, expected } of invalidHtmls) {
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import {
|
import {
|
||||||
getJobPriority,
|
getJobPriority,
|
||||||
addJobPriority,
|
addJobPriority,
|
||||||
deleteJobPriority
|
deleteJobPriority,
|
||||||
} from "../job-priority";
|
} from "../job-priority";
|
||||||
import { redisConnection } from "../../services/queue-service";
|
import { redisConnection } from "../../services/queue-service";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
@@ -11,8 +11,8 @@ jest.mock("../../services/queue-service", () => ({
|
|||||||
sadd: jest.fn(),
|
sadd: jest.fn(),
|
||||||
srem: jest.fn(),
|
srem: jest.fn(),
|
||||||
scard: jest.fn(),
|
scard: jest.fn(),
|
||||||
expire: jest.fn()
|
expire: jest.fn(),
|
||||||
}
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
describe("Job Priority Tests", () => {
|
describe("Job Priority Tests", () => {
|
||||||
@@ -26,11 +26,11 @@ describe("Job Priority Tests", () => {
|
|||||||
await addJobPriority(team_id, job_id);
|
await addJobPriority(team_id, job_id);
|
||||||
expect(redisConnection.sadd).toHaveBeenCalledWith(
|
expect(redisConnection.sadd).toHaveBeenCalledWith(
|
||||||
`limit_team_id:${team_id}`,
|
`limit_team_id:${team_id}`,
|
||||||
job_id
|
job_id,
|
||||||
);
|
);
|
||||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||||
`limit_team_id:${team_id}`,
|
`limit_team_id:${team_id}`,
|
||||||
60
|
60,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -40,7 +40,7 @@ describe("Job Priority Tests", () => {
|
|||||||
await deleteJobPriority(team_id, job_id);
|
await deleteJobPriority(team_id, job_id);
|
||||||
expect(redisConnection.srem).toHaveBeenCalledWith(
|
expect(redisConnection.srem).toHaveBeenCalledWith(
|
||||||
`limit_team_id:${team_id}`,
|
`limit_team_id:${team_id}`,
|
||||||
job_id
|
job_id,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -89,7 +89,7 @@ describe("Job Priority Tests", () => {
|
|||||||
await addJobPriority(team_id, job_id1);
|
await addJobPriority(team_id, job_id1);
|
||||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||||
`limit_team_id:${team_id}`,
|
`limit_team_id:${team_id}`,
|
||||||
60
|
60,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Clear the mock calls
|
// Clear the mock calls
|
||||||
@@ -99,7 +99,7 @@ describe("Job Priority Tests", () => {
|
|||||||
await addJobPriority(team_id, job_id2);
|
await addJobPriority(team_id, job_id2);
|
||||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||||
`limit_team_id:${team_id}`,
|
`limit_team_id:${team_id}`,
|
||||||
60
|
60,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -112,7 +112,7 @@ describe("Job Priority Tests", () => {
|
|||||||
await addJobPriority(team_id, job_id);
|
await addJobPriority(team_id, job_id);
|
||||||
expect(redisConnection.expire).toHaveBeenCalledWith(
|
expect(redisConnection.expire).toHaveBeenCalledWith(
|
||||||
`limit_team_id:${team_id}`,
|
`limit_team_id:${team_id}`,
|
||||||
60
|
60,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Fast-forward time by 59 seconds
|
// Fast-forward time by 59 seconds
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
export async function batchProcess<T>(
|
export async function batchProcess<T>(
|
||||||
array: T[],
|
array: T[],
|
||||||
batchSize: number,
|
batchSize: number,
|
||||||
asyncFunction: (item: T, index: number) => Promise<void>
|
asyncFunction: (item: T, index: number) => Promise<void>,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const batches: T[][] = [];
|
const batches: T[][] = [];
|
||||||
for (let i = 0; i < array.length; i += batchSize) {
|
for (let i = 0; i < array.length; i += batchSize) {
|
||||||
|
|||||||
@@ -6,14 +6,14 @@ const logger = _logger.child({ module: "cache" });
|
|||||||
|
|
||||||
export const cacheRedis = process.env.CACHE_REDIS_URL
|
export const cacheRedis = process.env.CACHE_REDIS_URL
|
||||||
? new IORedis(process.env.CACHE_REDIS_URL, {
|
? new IORedis(process.env.CACHE_REDIS_URL, {
|
||||||
maxRetriesPerRequest: null
|
maxRetriesPerRequest: null,
|
||||||
})
|
})
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
export function cacheKey(
|
export function cacheKey(
|
||||||
url: string,
|
url: string,
|
||||||
scrapeOptions: ScrapeOptions,
|
scrapeOptions: ScrapeOptions,
|
||||||
internalOptions: InternalOptions
|
internalOptions: InternalOptions,
|
||||||
): string | null {
|
): string | null {
|
||||||
if (!cacheRedis) return null;
|
if (!cacheRedis) return null;
|
||||||
|
|
||||||
@@ -49,7 +49,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function getEntryFromCache(
|
export async function getEntryFromCache(
|
||||||
key: string
|
key: string,
|
||||||
): Promise<CacheEntry | null> {
|
): Promise<CacheEntry | null> {
|
||||||
if (!cacheRedis) return null;
|
if (!cacheRedis) return null;
|
||||||
|
|
||||||
|
|||||||
@@ -14,37 +14,37 @@ export function getConcurrencyLimitMax(plan: string): number {
|
|||||||
|
|
||||||
export async function cleanOldConcurrencyLimitEntries(
|
export async function cleanOldConcurrencyLimitEntries(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
now: number = Date.now()
|
now: number = Date.now(),
|
||||||
) {
|
) {
|
||||||
await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
|
await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getConcurrencyLimitActiveJobs(
|
export async function getConcurrencyLimitActiveJobs(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
now: number = Date.now()
|
now: number = Date.now(),
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
return await redisConnection.zrangebyscore(
|
return await redisConnection.zrangebyscore(
|
||||||
constructKey(team_id),
|
constructKey(team_id),
|
||||||
now,
|
now,
|
||||||
Infinity
|
Infinity,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function pushConcurrencyLimitActiveJob(
|
export async function pushConcurrencyLimitActiveJob(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
id: string,
|
id: string,
|
||||||
now: number = Date.now()
|
now: number = Date.now(),
|
||||||
) {
|
) {
|
||||||
await redisConnection.zadd(
|
await redisConnection.zadd(
|
||||||
constructKey(team_id),
|
constructKey(team_id),
|
||||||
now + stalledJobTimeoutMs,
|
now + stalledJobTimeoutMs,
|
||||||
id
|
id,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function removeConcurrencyLimitActiveJob(
|
export async function removeConcurrencyLimitActiveJob(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
id: string
|
id: string,
|
||||||
) {
|
) {
|
||||||
await redisConnection.zrem(constructKey(team_id), id);
|
await redisConnection.zrem(constructKey(team_id), id);
|
||||||
}
|
}
|
||||||
@@ -57,7 +57,7 @@ export type ConcurrencyLimitedJob = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export async function takeConcurrencyLimitedJob(
|
export async function takeConcurrencyLimitedJob(
|
||||||
team_id: string
|
team_id: string,
|
||||||
): Promise<ConcurrencyLimitedJob | null> {
|
): Promise<ConcurrencyLimitedJob | null> {
|
||||||
const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
|
const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
|
||||||
if (res === null || res === undefined) {
|
if (res === null || res === undefined) {
|
||||||
@@ -69,11 +69,11 @@ export async function takeConcurrencyLimitedJob(
|
|||||||
|
|
||||||
export async function pushConcurrencyLimitedJob(
|
export async function pushConcurrencyLimitedJob(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
job: ConcurrencyLimitedJob
|
job: ConcurrencyLimitedJob,
|
||||||
) {
|
) {
|
||||||
await redisConnection.zadd(
|
await redisConnection.zadd(
|
||||||
constructQueueKey(team_id),
|
constructQueueKey(team_id),
|
||||||
job.priority ?? 1,
|
job.priority ?? 1,
|
||||||
JSON.stringify(job)
|
JSON.stringify(job),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import { generateURLPermutations } from "./crawl-redis";
|
|||||||
describe("generateURLPermutations", () => {
|
describe("generateURLPermutations", () => {
|
||||||
it("generates permutations correctly", () => {
|
it("generates permutations correctly", () => {
|
||||||
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(
|
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(
|
||||||
(x) => x.href
|
(x) => x.href,
|
||||||
);
|
);
|
||||||
expect(bareHttps.length).toBe(4);
|
expect(bareHttps.length).toBe(4);
|
||||||
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
|
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
@@ -12,7 +12,7 @@ describe("generateURLPermutations", () => {
|
|||||||
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
|
|
||||||
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(
|
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(
|
||||||
(x) => x.href
|
(x) => x.href,
|
||||||
);
|
);
|
||||||
expect(bareHttp.length).toBe(4);
|
expect(bareHttp.length).toBe(4);
|
||||||
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
|
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
@@ -21,7 +21,7 @@ describe("generateURLPermutations", () => {
|
|||||||
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
|
|
||||||
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(
|
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(
|
||||||
(x) => x.href
|
(x) => x.href,
|
||||||
);
|
);
|
||||||
expect(wwwHttps.length).toBe(4);
|
expect(wwwHttps.length).toBe(4);
|
||||||
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
|
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
@@ -30,7 +30,7 @@ describe("generateURLPermutations", () => {
|
|||||||
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
|
|
||||||
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(
|
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(
|
||||||
(x) => x.href
|
(x) => x.href,
|
||||||
);
|
);
|
||||||
expect(wwwHttp.length).toBe(4);
|
expect(wwwHttp.length).toBe(4);
|
||||||
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
|
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) {
|
|||||||
method: "saveCrawl",
|
method: "saveCrawl",
|
||||||
crawlId: id,
|
crawlId: id,
|
||||||
teamId: crawl.team_id,
|
teamId: crawl.team_id,
|
||||||
plan: crawl.plan
|
plan: crawl.plan,
|
||||||
});
|
});
|
||||||
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
|
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
|
||||||
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
|
||||||
@@ -53,7 +53,7 @@ export async function addCrawlJob(id: string, job_id: string) {
|
|||||||
jobId: job_id,
|
jobId: job_id,
|
||||||
module: "crawl-redis",
|
module: "crawl-redis",
|
||||||
method: "addCrawlJob",
|
method: "addCrawlJob",
|
||||||
crawlId: id
|
crawlId: id,
|
||||||
});
|
});
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||||
@@ -64,7 +64,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
|
|||||||
jobIds: job_ids,
|
jobIds: job_ids,
|
||||||
module: "crawl-redis",
|
module: "crawl-redis",
|
||||||
method: "addCrawlJobs",
|
method: "addCrawlJobs",
|
||||||
crawlId: id
|
crawlId: id,
|
||||||
});
|
});
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
|
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||||
@@ -73,19 +73,19 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
|
|||||||
export async function addCrawlJobDone(
|
export async function addCrawlJobDone(
|
||||||
id: string,
|
id: string,
|
||||||
job_id: string,
|
job_id: string,
|
||||||
success: boolean
|
success: boolean,
|
||||||
) {
|
) {
|
||||||
_logger.debug("Adding done crawl job to Redis...", {
|
_logger.debug("Adding done crawl job to Redis...", {
|
||||||
jobId: job_id,
|
jobId: job_id,
|
||||||
module: "crawl-redis",
|
module: "crawl-redis",
|
||||||
method: "addCrawlJobDone",
|
method: "addCrawlJobDone",
|
||||||
crawlId: id
|
crawlId: id,
|
||||||
});
|
});
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
||||||
await redisConnection.expire(
|
await redisConnection.expire(
|
||||||
"crawl:" + id + ":jobs_done",
|
"crawl:" + id + ":jobs_done",
|
||||||
24 * 60 * 60,
|
24 * 60 * 60,
|
||||||
"NX"
|
"NX",
|
||||||
);
|
);
|
||||||
|
|
||||||
if (success) {
|
if (success) {
|
||||||
@@ -93,7 +93,7 @@ export async function addCrawlJobDone(
|
|||||||
await redisConnection.expire(
|
await redisConnection.expire(
|
||||||
"crawl:" + id + ":jobs_done_ordered",
|
"crawl:" + id + ":jobs_done_ordered",
|
||||||
24 * 60 * 60,
|
24 * 60 * 60,
|
||||||
"NX"
|
"NX",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -105,12 +105,12 @@ export async function getDoneJobsOrderedLength(id: string): Promise<number> {
|
|||||||
export async function getDoneJobsOrdered(
|
export async function getDoneJobsOrdered(
|
||||||
id: string,
|
id: string,
|
||||||
start = 0,
|
start = 0,
|
||||||
end = -1
|
end = -1,
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
return await redisConnection.lrange(
|
return await redisConnection.lrange(
|
||||||
"crawl:" + id + ":jobs_done_ordered",
|
"crawl:" + id + ":jobs_done_ordered",
|
||||||
start,
|
start,
|
||||||
end
|
end,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,7 +130,7 @@ export async function finishCrawl(id: string) {
|
|||||||
_logger.debug("Marking crawl as finished.", {
|
_logger.debug("Marking crawl as finished.", {
|
||||||
module: "crawl-redis",
|
module: "crawl-redis",
|
||||||
method: "finishCrawl",
|
method: "finishCrawl",
|
||||||
crawlId: id
|
crawlId: id,
|
||||||
});
|
});
|
||||||
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
||||||
if (set === 1) {
|
if (set === 1) {
|
||||||
@@ -141,7 +141,7 @@ export async function finishCrawl(id: string) {
|
|||||||
_logger.debug("Crawl can not be finished yet, not marking as finished.", {
|
_logger.debug("Crawl can not be finished yet, not marking as finished.", {
|
||||||
module: "crawl-redis",
|
module: "crawl-redis",
|
||||||
method: "finishCrawl",
|
method: "finishCrawl",
|
||||||
crawlId: id
|
crawlId: id,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -154,7 +154,7 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
|
|||||||
return await redisConnection.zrangebyscore(
|
return await redisConnection.zrangebyscore(
|
||||||
"concurrency-limiter:" + teamId + ":throttled",
|
"concurrency-limiter:" + teamId + ":throttled",
|
||||||
Date.now(),
|
Date.now(),
|
||||||
Infinity
|
Infinity,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -201,7 +201,7 @@ export function generateURLPermutations(url: string | URL): URL[] {
|
|||||||
export async function lockURL(
|
export async function lockURL(
|
||||||
id: string,
|
id: string,
|
||||||
sc: StoredCrawl,
|
sc: StoredCrawl,
|
||||||
url: string
|
url: string,
|
||||||
): Promise<boolean> {
|
): Promise<boolean> {
|
||||||
let logger = _logger.child({
|
let logger = _logger.child({
|
||||||
crawlId: id,
|
crawlId: id,
|
||||||
@@ -209,7 +209,7 @@ export async function lockURL(
|
|||||||
method: "lockURL",
|
method: "lockURL",
|
||||||
preNormalizedURL: url,
|
preNormalizedURL: url,
|
||||||
teamId: sc.team_id,
|
teamId: sc.team_id,
|
||||||
plan: sc.plan
|
plan: sc.plan,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||||
@@ -218,7 +218,7 @@ export async function lockURL(
|
|||||||
sc.crawlerOptions.limit
|
sc.crawlerOptions.limit
|
||||||
) {
|
) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Crawl has already hit visited_unique limit, not locking URL."
|
"Crawl has already hit visited_unique limit, not locking URL.",
|
||||||
);
|
);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -231,7 +231,7 @@ export async function lockURL(
|
|||||||
await redisConnection.expire(
|
await redisConnection.expire(
|
||||||
"crawl:" + id + ":visited_unique",
|
"crawl:" + id + ":visited_unique",
|
||||||
24 * 60 * 60,
|
24 * 60 * 60,
|
||||||
"NX"
|
"NX",
|
||||||
);
|
);
|
||||||
|
|
||||||
let res: boolean;
|
let res: boolean;
|
||||||
@@ -242,7 +242,7 @@ export async function lockURL(
|
|||||||
// logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
|
// logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
|
||||||
const x = await redisConnection.sadd(
|
const x = await redisConnection.sadd(
|
||||||
"crawl:" + id + ":visited",
|
"crawl:" + id + ":visited",
|
||||||
...permutations
|
...permutations,
|
||||||
);
|
);
|
||||||
res = x === permutations.length;
|
res = x === permutations.length;
|
||||||
}
|
}
|
||||||
@@ -250,7 +250,7 @@ export async function lockURL(
|
|||||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||||
|
|
||||||
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
|
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
|
||||||
res
|
res,
|
||||||
});
|
});
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
@@ -259,7 +259,7 @@ export async function lockURL(
|
|||||||
export async function lockURLs(
|
export async function lockURLs(
|
||||||
id: string,
|
id: string,
|
||||||
sc: StoredCrawl,
|
sc: StoredCrawl,
|
||||||
urls: string[]
|
urls: string[],
|
||||||
): Promise<boolean> {
|
): Promise<boolean> {
|
||||||
urls = urls.map((url) => normalizeURL(url, sc));
|
urls = urls.map((url) => normalizeURL(url, sc));
|
||||||
const logger = _logger.child({
|
const logger = _logger.child({
|
||||||
@@ -267,7 +267,7 @@ export async function lockURLs(
|
|||||||
module: "crawl-redis",
|
module: "crawl-redis",
|
||||||
method: "lockURL",
|
method: "lockURL",
|
||||||
teamId: sc.team_id,
|
teamId: sc.team_id,
|
||||||
plan: sc.plan
|
plan: sc.plan,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Add to visited_unique set
|
// Add to visited_unique set
|
||||||
@@ -276,7 +276,7 @@ export async function lockURLs(
|
|||||||
await redisConnection.expire(
|
await redisConnection.expire(
|
||||||
"crawl:" + id + ":visited_unique",
|
"crawl:" + id + ":visited_unique",
|
||||||
24 * 60 * 60,
|
24 * 60 * 60,
|
||||||
"NX"
|
"NX",
|
||||||
);
|
);
|
||||||
|
|
||||||
let res: boolean;
|
let res: boolean;
|
||||||
@@ -285,12 +285,12 @@ export async function lockURLs(
|
|||||||
res = x === urls.length;
|
res = x === urls.length;
|
||||||
} else {
|
} else {
|
||||||
const allPermutations = urls.flatMap((url) =>
|
const allPermutations = urls.flatMap((url) =>
|
||||||
generateURLPermutations(url).map((x) => x.href)
|
generateURLPermutations(url).map((x) => x.href),
|
||||||
);
|
);
|
||||||
logger.debug("Adding " + allPermutations.length + " URL permutations...");
|
logger.debug("Adding " + allPermutations.length + " URL permutations...");
|
||||||
const x = await redisConnection.sadd(
|
const x = await redisConnection.sadd(
|
||||||
"crawl:" + id + ":visited",
|
"crawl:" + id + ":visited",
|
||||||
...allPermutations
|
...allPermutations,
|
||||||
);
|
);
|
||||||
res = x === allPermutations.length;
|
res = x === allPermutations.length;
|
||||||
}
|
}
|
||||||
@@ -304,7 +304,7 @@ export async function lockURLs(
|
|||||||
export function crawlToCrawler(
|
export function crawlToCrawler(
|
||||||
id: string,
|
id: string,
|
||||||
sc: StoredCrawl,
|
sc: StoredCrawl,
|
||||||
newBase?: string
|
newBase?: string,
|
||||||
): WebCrawler {
|
): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
@@ -315,7 +315,7 @@ export function crawlToCrawler(
|
|||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
maxCrawledDepth: getAdjustedMaxDepth(
|
maxCrawledDepth: getAdjustedMaxDepth(
|
||||||
sc.originUrl!,
|
sc.originUrl!,
|
||||||
sc.crawlerOptions?.maxDepth ?? 10
|
sc.crawlerOptions?.maxDepth ?? 10,
|
||||||
),
|
),
|
||||||
limit: sc.crawlerOptions?.limit ?? 10000,
|
limit: sc.crawlerOptions?.limit ?? 10000,
|
||||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||||
@@ -323,7 +323,7 @@ export function crawlToCrawler(
|
|||||||
allowExternalContentLinks:
|
allowExternalContentLinks:
|
||||||
sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
||||||
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
|
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
|
||||||
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false
|
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (sc.robots !== undefined) {
|
if (sc.robots !== undefined) {
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ export class CustomError extends Error {
|
|||||||
statusCode: number,
|
statusCode: number,
|
||||||
status: string,
|
status: string,
|
||||||
message: string = "",
|
message: string = "",
|
||||||
dataIngestionJob?: any
|
dataIngestionJob?: any,
|
||||||
) {
|
) {
|
||||||
super(message);
|
super(message);
|
||||||
this.statusCode = statusCode;
|
this.statusCode = statusCode;
|
||||||
|
|||||||
@@ -8,21 +8,21 @@ export const defaultPageOptions = {
|
|||||||
waitFor: 0,
|
waitFor: 0,
|
||||||
screenshot: false,
|
screenshot: false,
|
||||||
fullPageScreenshot: false,
|
fullPageScreenshot: false,
|
||||||
parsePDF: true
|
parsePDF: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
export const defaultCrawlerOptions = {
|
export const defaultCrawlerOptions = {
|
||||||
allowBackwardCrawling: false,
|
allowBackwardCrawling: false,
|
||||||
limit: 10000
|
limit: 10000,
|
||||||
};
|
};
|
||||||
|
|
||||||
export const defaultCrawlPageOptions = {
|
export const defaultCrawlPageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
removeTags: [],
|
removeTags: [],
|
||||||
parsePDF: true
|
parsePDF: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
export const defaultExtractorOptions = {
|
export const defaultExtractorOptions = {
|
||||||
mode: "markdown"
|
mode: "markdown",
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,21 +1,21 @@
|
|||||||
import { CohereClient } from "cohere-ai";
|
import { CohereClient } from "cohere-ai";
|
||||||
import { MapDocument } from "../../controllers/v1/types";
|
import { MapDocument } from "../../controllers/v1/types";
|
||||||
const cohere = new CohereClient({
|
const cohere = new CohereClient({
|
||||||
token: process.env.COHERE_API_KEY
|
token: process.env.COHERE_API_KEY,
|
||||||
});
|
});
|
||||||
|
|
||||||
export async function rerankDocuments(
|
export async function rerankDocuments(
|
||||||
documents: (string | Record<string, string>)[],
|
documents: (string | Record<string, string>)[],
|
||||||
query: string,
|
query: string,
|
||||||
topN = 3,
|
topN = 3,
|
||||||
model = "rerank-english-v3.0"
|
model = "rerank-english-v3.0",
|
||||||
) {
|
) {
|
||||||
const rerank = await cohere.v2.rerank({
|
const rerank = await cohere.v2.rerank({
|
||||||
documents,
|
documents,
|
||||||
query,
|
query,
|
||||||
topN,
|
topN,
|
||||||
model,
|
model,
|
||||||
returnDocuments: true
|
returnDocuments: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
return rerank.results
|
return rerank.results
|
||||||
@@ -23,6 +23,6 @@ export async function rerankDocuments(
|
|||||||
.map((x) => ({
|
.map((x) => ({
|
||||||
document: x.document,
|
document: x.document,
|
||||||
index: x.index,
|
index: x.index,
|
||||||
relevanceScore: x.relevanceScore
|
relevanceScore: x.relevanceScore,
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ const goExecutablePath = join(
|
|||||||
process.cwd(),
|
process.cwd(),
|
||||||
"sharedLibs",
|
"sharedLibs",
|
||||||
"go-html-to-md",
|
"go-html-to-md",
|
||||||
"html-to-markdown.so"
|
"html-to-markdown.so",
|
||||||
);
|
);
|
||||||
|
|
||||||
class GoMarkdownConverter {
|
class GoMarkdownConverter {
|
||||||
@@ -51,7 +51,7 @@ class GoMarkdownConverter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function parseMarkdown(
|
export async function parseMarkdown(
|
||||||
html: string | null | undefined
|
html: string | null | undefined,
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
if (!html) {
|
if (!html) {
|
||||||
return "";
|
return "";
|
||||||
@@ -74,12 +74,12 @@ export async function parseMarkdown(
|
|||||||
) {
|
) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
logger.error(
|
logger.error(
|
||||||
`Error converting HTML to Markdown with Go parser: ${error}`
|
`Error converting HTML to Markdown with Go parser: ${error}`,
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
logger.warn(
|
logger.warn(
|
||||||
"Tried to use Go parser, but it doesn't exist in the file system.",
|
"Tried to use Go parser, but it doesn't exist in the file system.",
|
||||||
{ goExecutablePath }
|
{ goExecutablePath },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -101,7 +101,7 @@ export async function parseMarkdown(
|
|||||||
var href = node.getAttribute("href").trim();
|
var href = node.getAttribute("href").trim();
|
||||||
var title = node.title ? ' "' + node.title + '"' : "";
|
var title = node.title ? ' "' + node.title + '"' : "";
|
||||||
return "[" + content.trim() + "](" + href + title + ")\n";
|
return "[" + content.trim() + "](" + href + title + ")\n";
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
var gfm = turndownPluginGfm.gfm;
|
var gfm = turndownPluginGfm.gfm;
|
||||||
turndownService.use(gfm);
|
turndownService.use(gfm);
|
||||||
@@ -145,7 +145,7 @@ function removeSkipToContentLinks(markdownContent: string): string {
|
|||||||
// Remove [Skip to Content](#page) and [Skip to content](#skip)
|
// Remove [Skip to Content](#page) and [Skip to content](#skip)
|
||||||
const newMarkdownContent = markdownContent.replace(
|
const newMarkdownContent = markdownContent.replace(
|
||||||
/\[Skip to Content\]\(#[^\)]*\)/gi,
|
/\[Skip to Content\]\(#[^\)]*\)/gi,
|
||||||
""
|
"",
|
||||||
);
|
);
|
||||||
return newMarkdownContent;
|
return newMarkdownContent;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ export async function deleteJobPriority(team_id, job_id) {
|
|||||||
export async function getJobPriority({
|
export async function getJobPriority({
|
||||||
plan,
|
plan,
|
||||||
team_id,
|
team_id,
|
||||||
basePriority = 10
|
basePriority = 10,
|
||||||
}: {
|
}: {
|
||||||
plan: PlanType | undefined;
|
plan: PlanType | undefined;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
@@ -91,12 +91,12 @@ export async function getJobPriority({
|
|||||||
} else {
|
} else {
|
||||||
// If not, we keep base priority + planModifier
|
// If not, we keep base priority + planModifier
|
||||||
return Math.ceil(
|
return Math.ceil(
|
||||||
basePriority + Math.ceil((setLength - bucketLimit) * planModifier)
|
basePriority + Math.ceil((setLength - bucketLimit) * planModifier),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
|
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}`,
|
||||||
);
|
);
|
||||||
return basePriority;
|
return basePriority;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,14 +14,14 @@ const logFormat = winston.format.printf(
|
|||||||
name: value.name,
|
name: value.name,
|
||||||
message: value.message,
|
message: value.message,
|
||||||
stack: value.stack,
|
stack: value.stack,
|
||||||
cause: value.cause
|
cause: value.cause,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
: ""
|
: ""
|
||||||
}`
|
}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
export const logger = winston.createLogger({
|
export const logger = winston.createLogger({
|
||||||
@@ -34,26 +34,26 @@ export const logger = winston.createLogger({
|
|||||||
name: value.name,
|
name: value.name,
|
||||||
message: value.message,
|
message: value.message,
|
||||||
stack: value.stack,
|
stack: value.stack,
|
||||||
cause: value.cause
|
cause: value.cause,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
}),
|
}),
|
||||||
transports: [
|
transports: [
|
||||||
new winston.transports.Console({
|
new winston.transports.Console({
|
||||||
format: winston.format.combine(
|
format: winston.format.combine(
|
||||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||||
winston.format.metadata({
|
winston.format.metadata({
|
||||||
fillExcept: ["message", "level", "timestamp"]
|
fillExcept: ["message", "level", "timestamp"],
|
||||||
}),
|
}),
|
||||||
...((process.env.ENV === "production" &&
|
...((process.env.ENV === "production" &&
|
||||||
process.env.SENTRY_ENVIRONMENT === "dev") ||
|
process.env.SENTRY_ENVIRONMENT === "dev") ||
|
||||||
process.env.ENV !== "production"
|
process.env.ENV !== "production"
|
||||||
? [winston.format.colorize(), logFormat]
|
? [winston.format.colorize(), logFormat]
|
||||||
: [])
|
: []),
|
||||||
)
|
),
|
||||||
})
|
}),
|
||||||
]
|
],
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -6,10 +6,10 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
|
|||||||
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
|
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
|
||||||
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
|
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
|
||||||
const magnitude1 = Math.sqrt(
|
const magnitude1 = Math.sqrt(
|
||||||
vec1.reduce((sum, val) => sum + val * val, 0)
|
vec1.reduce((sum, val) => sum + val * val, 0),
|
||||||
);
|
);
|
||||||
const magnitude2 = Math.sqrt(
|
const magnitude2 = Math.sqrt(
|
||||||
vec2.reduce((sum, val) => sum + val * val, 0)
|
vec2.reduce((sum, val) => sum + val * val, 0),
|
||||||
);
|
);
|
||||||
if (magnitude1 === 0 || magnitude2 === 0) return 0;
|
if (magnitude1 === 0 || magnitude2 === 0) return 0;
|
||||||
return dotProduct / (magnitude1 * magnitude2);
|
return dotProduct / (magnitude1 * magnitude2);
|
||||||
|
|||||||
@@ -5,13 +5,13 @@ describe("performRanking", () => {
|
|||||||
const linksWithContext = [
|
const linksWithContext = [
|
||||||
"url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds",
|
"url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds",
|
||||||
"url: https://example.com/cats, title: Cat care guide, description: Everything about cats",
|
"url: https://example.com/cats, title: Cat care guide, description: Everything about cats",
|
||||||
"url: https://example.com/pets, title: General pet care, description: Care for all types of pets"
|
"url: https://example.com/pets, title: General pet care, description: Care for all types of pets",
|
||||||
];
|
];
|
||||||
|
|
||||||
const links = [
|
const links = [
|
||||||
"https://example.com/dogs",
|
"https://example.com/dogs",
|
||||||
"https://example.com/cats",
|
"https://example.com/cats",
|
||||||
"https://example.com/pets"
|
"https://example.com/pets",
|
||||||
];
|
];
|
||||||
|
|
||||||
const searchQuery = "cats training";
|
const searchQuery = "cats training";
|
||||||
@@ -50,7 +50,7 @@ describe("performRanking", () => {
|
|||||||
it("should maintain original order for equal scores", async () => {
|
it("should maintain original order for equal scores", async () => {
|
||||||
const linksWithContext = [
|
const linksWithContext = [
|
||||||
"url: https://example.com/1, title: Similar content A, description: test",
|
"url: https://example.com/1, title: Similar content A, description: test",
|
||||||
"url: https://example.com/2, title: Similar content B, description: test"
|
"url: https://example.com/2, title: Similar content B, description: test",
|
||||||
];
|
];
|
||||||
|
|
||||||
const links = ["https://example.com/1", "https://example.com/2"];
|
const links = ["https://example.com/1", "https://example.com/2"];
|
||||||
|
|||||||
@@ -5,14 +5,14 @@ import OpenAI from "openai";
|
|||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
apiKey: process.env.OPENAI_API_KEY
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
});
|
});
|
||||||
|
|
||||||
async function getEmbedding(text: string) {
|
async function getEmbedding(text: string) {
|
||||||
const embedding = await openai.embeddings.create({
|
const embedding = await openai.embeddings.create({
|
||||||
model: "text-embedding-ada-002",
|
model: "text-embedding-ada-002",
|
||||||
input: text,
|
input: text,
|
||||||
encoding_format: "float"
|
encoding_format: "float",
|
||||||
});
|
});
|
||||||
|
|
||||||
return embedding.data[0].embedding;
|
return embedding.data[0].embedding;
|
||||||
@@ -39,7 +39,7 @@ const textToVector = (searchQuery: string, text: string): number[] => {
|
|||||||
async function performRanking(
|
async function performRanking(
|
||||||
linksWithContext: string[],
|
linksWithContext: string[],
|
||||||
links: string[],
|
links: string[],
|
||||||
searchQuery: string
|
searchQuery: string,
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
// Handle invalid inputs
|
// Handle invalid inputs
|
||||||
@@ -64,7 +64,7 @@ async function performRanking(
|
|||||||
link: links[index],
|
link: links[index],
|
||||||
linkWithContext,
|
linkWithContext,
|
||||||
score,
|
score,
|
||||||
originalIndex: index
|
originalIndex: index,
|
||||||
};
|
};
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
// If embedding fails for a link, return with score 0
|
// If embedding fails for a link, return with score 0
|
||||||
@@ -72,10 +72,10 @@ async function performRanking(
|
|||||||
link: links[index],
|
link: links[index],
|
||||||
linkWithContext,
|
linkWithContext,
|
||||||
score: 0,
|
score: 0,
|
||||||
originalIndex: index
|
originalIndex: index,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
})
|
}),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Sort links based on similarity scores while preserving original order for equal scores
|
// Sort links based on similarity scores while preserving original order for equal scores
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ export class ScrapeEvents {
|
|||||||
.insert({
|
.insert({
|
||||||
job_id: jobId,
|
job_id: jobId,
|
||||||
type: content.type,
|
type: content.type,
|
||||||
content: content
|
content: content,
|
||||||
// created_at
|
// created_at
|
||||||
})
|
})
|
||||||
.select()
|
.select()
|
||||||
@@ -73,7 +73,7 @@ export class ScrapeEvents {
|
|||||||
|
|
||||||
static async updateScrapeResult(
|
static async updateScrapeResult(
|
||||||
logId: number | null,
|
logId: number | null,
|
||||||
result: ScrapeScrapeEvent["result"]
|
result: ScrapeScrapeEvent["result"],
|
||||||
) {
|
) {
|
||||||
if (logId === null) return;
|
if (logId === null) return;
|
||||||
|
|
||||||
@@ -86,8 +86,8 @@ export class ScrapeEvents {
|
|||||||
.update({
|
.update({
|
||||||
content: {
|
content: {
|
||||||
...previousLog.content,
|
...previousLog.content,
|
||||||
result
|
result,
|
||||||
}
|
},
|
||||||
})
|
})
|
||||||
.eq("id", logId);
|
.eq("id", logId);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -100,7 +100,7 @@ export class ScrapeEvents {
|
|||||||
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
||||||
type: "queue",
|
type: "queue",
|
||||||
event,
|
event,
|
||||||
worker: process.env.FLY_MACHINE_ID
|
worker: process.env.FLY_MACHINE_ID,
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error logging job event: ${error}`);
|
logger.error(`Error logging job event: ${error}`);
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -20,7 +20,7 @@ describe("isSameDomain", () => {
|
|||||||
it("should return true for a subdomain with different protocols", () => {
|
it("should return true for a subdomain with different protocols", () => {
|
||||||
const result = isSameDomain(
|
const result = isSameDomain(
|
||||||
"https://sub.example.com",
|
"https://sub.example.com",
|
||||||
"http://example.com"
|
"http://example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -35,7 +35,7 @@ describe("isSameDomain", () => {
|
|||||||
it("should return true for a subdomain with www prefix", () => {
|
it("should return true for a subdomain with www prefix", () => {
|
||||||
const result = isSameDomain(
|
const result = isSameDomain(
|
||||||
"http://www.sub.example.com",
|
"http://www.sub.example.com",
|
||||||
"http://example.com"
|
"http://example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -43,7 +43,7 @@ describe("isSameDomain", () => {
|
|||||||
it("should return true for the same domain with www prefix", () => {
|
it("should return true for the same domain with www prefix", () => {
|
||||||
const result = isSameDomain(
|
const result = isSameDomain(
|
||||||
"http://docs.s.s.example.com",
|
"http://docs.s.s.example.com",
|
||||||
"http://example.com"
|
"http://example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -53,7 +53,7 @@ describe("isSameSubdomain", () => {
|
|||||||
it("should return false for a subdomain", () => {
|
it("should return false for a subdomain", () => {
|
||||||
const result = isSameSubdomain(
|
const result = isSameSubdomain(
|
||||||
"http://example.com",
|
"http://example.com",
|
||||||
"http://docs.example.com"
|
"http://docs.example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(false);
|
expect(result).toBe(false);
|
||||||
});
|
});
|
||||||
@@ -61,7 +61,7 @@ describe("isSameSubdomain", () => {
|
|||||||
it("should return true for the same subdomain", () => {
|
it("should return true for the same subdomain", () => {
|
||||||
const result = isSameSubdomain(
|
const result = isSameSubdomain(
|
||||||
"http://docs.example.com",
|
"http://docs.example.com",
|
||||||
"http://docs.example.com"
|
"http://docs.example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -69,7 +69,7 @@ describe("isSameSubdomain", () => {
|
|||||||
it("should return false for different subdomains", () => {
|
it("should return false for different subdomains", () => {
|
||||||
const result = isSameSubdomain(
|
const result = isSameSubdomain(
|
||||||
"http://docs.example.com",
|
"http://docs.example.com",
|
||||||
"http://blog.example.com"
|
"http://blog.example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(false);
|
expect(result).toBe(false);
|
||||||
});
|
});
|
||||||
@@ -89,7 +89,7 @@ describe("isSameSubdomain", () => {
|
|||||||
it("should return true for the same subdomain with different protocols", () => {
|
it("should return true for the same subdomain with different protocols", () => {
|
||||||
const result = isSameSubdomain(
|
const result = isSameSubdomain(
|
||||||
"https://docs.example.com",
|
"https://docs.example.com",
|
||||||
"http://docs.example.com"
|
"http://docs.example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -97,7 +97,7 @@ describe("isSameSubdomain", () => {
|
|||||||
it("should return true for the same subdomain with www prefix", () => {
|
it("should return true for the same subdomain with www prefix", () => {
|
||||||
const result = isSameSubdomain(
|
const result = isSameSubdomain(
|
||||||
"http://www.docs.example.com",
|
"http://www.docs.example.com",
|
||||||
"http://docs.example.com"
|
"http://docs.example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -105,7 +105,7 @@ describe("isSameSubdomain", () => {
|
|||||||
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
||||||
const result = isSameSubdomain(
|
const result = isSameSubdomain(
|
||||||
"http://www.docs.example.com",
|
"http://www.docs.example.com",
|
||||||
"http://blog.example.com"
|
"http://blog.example.com",
|
||||||
);
|
);
|
||||||
expect(result).toBe(false);
|
expect(result).toBe(false);
|
||||||
});
|
});
|
||||||
@@ -117,7 +117,7 @@ describe("removeDuplicateUrls", () => {
|
|||||||
"http://example.com",
|
"http://example.com",
|
||||||
"https://example.com",
|
"https://example.com",
|
||||||
"http://www.example.com",
|
"http://www.example.com",
|
||||||
"https://www.example.com"
|
"https://www.example.com",
|
||||||
];
|
];
|
||||||
const result = removeDuplicateUrls(urls);
|
const result = removeDuplicateUrls(urls);
|
||||||
expect(result).toEqual(["https://example.com"]);
|
expect(result).toEqual(["https://example.com"]);
|
||||||
@@ -128,14 +128,14 @@ describe("removeDuplicateUrls", () => {
|
|||||||
"https://example.com/page1",
|
"https://example.com/page1",
|
||||||
"https://example.com/page2",
|
"https://example.com/page2",
|
||||||
"https://example.com/page1?param=1",
|
"https://example.com/page1?param=1",
|
||||||
"https://example.com/page1#section1"
|
"https://example.com/page1#section1",
|
||||||
];
|
];
|
||||||
const result = removeDuplicateUrls(urls);
|
const result = removeDuplicateUrls(urls);
|
||||||
expect(result).toEqual([
|
expect(result).toEqual([
|
||||||
"https://example.com/page1",
|
"https://example.com/page1",
|
||||||
"https://example.com/page2",
|
"https://example.com/page2",
|
||||||
"https://example.com/page1?param=1",
|
"https://example.com/page1?param=1",
|
||||||
"https://example.com/page1#section1"
|
"https://example.com/page1#section1",
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ let warningCount = 0;
|
|||||||
|
|
||||||
export function withAuth<T, U extends any[]>(
|
export function withAuth<T, U extends any[]>(
|
||||||
originalFunction: (...args: U) => Promise<T>,
|
originalFunction: (...args: U) => Promise<T>,
|
||||||
mockSuccess: T
|
mockSuccess: T,
|
||||||
) {
|
) {
|
||||||
return async function (...args: U): Promise<T> {
|
return async function (...args: U): Promise<T> {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { Job } from "bullmq";
|
|||||||
import {
|
import {
|
||||||
WebScraperOptions,
|
WebScraperOptions,
|
||||||
RunWebScraperParams,
|
RunWebScraperParams,
|
||||||
RunWebScraperResult
|
RunWebScraperResult,
|
||||||
} from "../types";
|
} from "../types";
|
||||||
import { billTeam } from "../services/billing/credit_billing";
|
import { billTeam } from "../services/billing/credit_billing";
|
||||||
import { Document } from "../controllers/v1/types";
|
import { Document } from "../controllers/v1/types";
|
||||||
@@ -13,14 +13,14 @@ import { configDotenv } from "dotenv";
|
|||||||
import {
|
import {
|
||||||
EngineResultsTracker,
|
EngineResultsTracker,
|
||||||
scrapeURL,
|
scrapeURL,
|
||||||
ScrapeUrlResponse
|
ScrapeUrlResponse,
|
||||||
} from "../scraper/scrapeURL";
|
} from "../scraper/scrapeURL";
|
||||||
import { Engine } from "../scraper/scrapeURL/engines";
|
import { Engine } from "../scraper/scrapeURL/engines";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
token
|
token,
|
||||||
}: {
|
}: {
|
||||||
job: Job<WebScraperOptions> & { id: string };
|
job: Job<WebScraperOptions> & { id: string };
|
||||||
token: string;
|
token: string;
|
||||||
@@ -32,9 +32,9 @@ export async function startWebScraperPipeline({
|
|||||||
...job.data.scrapeOptions,
|
...job.data.scrapeOptions,
|
||||||
...(job.data.crawl_id
|
...(job.data.crawl_id
|
||||||
? {
|
? {
|
||||||
formats: job.data.scrapeOptions.formats.concat(["rawHtml"])
|
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
},
|
},
|
||||||
internalOptions: job.data.internalOptions,
|
internalOptions: job.data.internalOptions,
|
||||||
// onSuccess: (result, mode) => {
|
// onSuccess: (result, mode) => {
|
||||||
@@ -48,7 +48,7 @@ export async function startWebScraperPipeline({
|
|||||||
team_id: job.data.team_id,
|
team_id: job.data.team_id,
|
||||||
bull_job_id: job.id.toString(),
|
bull_job_id: job.id.toString(),
|
||||||
priority: job.opts.priority,
|
priority: job.opts.priority,
|
||||||
is_scrape: job.data.is_scrape ?? false
|
is_scrape: job.data.is_scrape ?? false,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -62,14 +62,14 @@ export async function runWebScraper({
|
|||||||
team_id,
|
team_id,
|
||||||
bull_job_id,
|
bull_job_id,
|
||||||
priority,
|
priority,
|
||||||
is_scrape = false
|
is_scrape = false,
|
||||||
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
||||||
let response: ScrapeUrlResponse | undefined = undefined;
|
let response: ScrapeUrlResponse | undefined = undefined;
|
||||||
let engines: EngineResultsTracker = {};
|
let engines: EngineResultsTracker = {};
|
||||||
try {
|
try {
|
||||||
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
|
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
|
||||||
priority,
|
priority,
|
||||||
...internalOptions
|
...internalOptions,
|
||||||
});
|
});
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
if (response.error instanceof Error) {
|
if (response.error instanceof Error) {
|
||||||
@@ -81,7 +81,7 @@ export async function runWebScraper({
|
|||||||
? JSON.stringify(response.error)
|
? JSON.stringify(response.error)
|
||||||
: typeof response.error === "object"
|
: typeof response.error === "object"
|
||||||
? JSON.stringify({ ...response.error })
|
? JSON.stringify({ ...response.error })
|
||||||
: response.error)
|
: response.error),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -94,7 +94,7 @@ export async function runWebScraper({
|
|||||||
|
|
||||||
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
|
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`
|
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
|
||||||
);
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
@@ -117,14 +117,14 @@ export async function runWebScraper({
|
|||||||
return {
|
return {
|
||||||
...response,
|
...response,
|
||||||
success: false,
|
success: false,
|
||||||
error
|
error,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error,
|
error,
|
||||||
logs: ["no logs -- error coming from runWebScraper"],
|
logs: ["no logs -- error coming from runWebScraper"],
|
||||||
engines
|
engines,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
// onError(error);
|
// onError(error);
|
||||||
@@ -154,8 +154,8 @@ export async function runWebScraper({
|
|||||||
: result.state === "timeout"
|
: result.state === "timeout"
|
||||||
? "Timed out"
|
? "Timed out"
|
||||||
: undefined,
|
: undefined,
|
||||||
time_taken: result.finishedAt - result.startedAt
|
time_taken: result.finishedAt - result.startedAt,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -166,7 +166,7 @@ const saveJob = async (
|
|||||||
result: any,
|
result: any,
|
||||||
token: string,
|
token: string,
|
||||||
mode: string,
|
mode: string,
|
||||||
engines?: EngineResultsTracker
|
engines?: EngineResultsTracker,
|
||||||
) => {
|
) => {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import {
|
|||||||
autoscalerController,
|
autoscalerController,
|
||||||
checkQueuesController,
|
checkQueuesController,
|
||||||
cleanBefore24hCompleteJobsController,
|
cleanBefore24hCompleteJobsController,
|
||||||
queuesController
|
queuesController,
|
||||||
} from "../controllers/v0/admin/queue";
|
} from "../controllers/v0/admin/queue";
|
||||||
import { wrap } from "./v1";
|
import { wrap } from "./v1";
|
||||||
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
||||||
@@ -13,27 +13,27 @@ export const adminRouter = express.Router();
|
|||||||
|
|
||||||
adminRouter.get(
|
adminRouter.get(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
|
`/admin/${process.env.BULL_AUTH_KEY}/redis-health`,
|
||||||
redisHealthController
|
redisHealthController,
|
||||||
);
|
);
|
||||||
|
|
||||||
adminRouter.get(
|
adminRouter.get(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
|
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
|
||||||
cleanBefore24hCompleteJobsController
|
cleanBefore24hCompleteJobsController,
|
||||||
);
|
);
|
||||||
|
|
||||||
adminRouter.get(
|
adminRouter.get(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
|
`/admin/${process.env.BULL_AUTH_KEY}/check-queues`,
|
||||||
checkQueuesController
|
checkQueuesController,
|
||||||
);
|
);
|
||||||
|
|
||||||
adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController);
|
adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController);
|
||||||
|
|
||||||
adminRouter.get(
|
adminRouter.get(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
|
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
|
||||||
autoscalerController
|
autoscalerController,
|
||||||
);
|
);
|
||||||
|
|
||||||
adminRouter.post(
|
adminRouter.post(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
|
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
|
||||||
wrap(acucCacheClearController)
|
wrap(acucCacheClearController),
|
||||||
);
|
);
|
||||||
|
|||||||
+18
-18
@@ -8,7 +8,7 @@ import {
|
|||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
RequestWithACUC,
|
RequestWithACUC,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
RequestWithMaybeAuth
|
RequestWithMaybeAuth,
|
||||||
} from "../controllers/v1/types";
|
} from "../controllers/v1/types";
|
||||||
import { RateLimiterMode } from "../types";
|
import { RateLimiterMode } from "../types";
|
||||||
import { authenticateUser } from "../controllers/auth";
|
import { authenticateUser } from "../controllers/auth";
|
||||||
@@ -33,7 +33,7 @@ import { extractController } from "../controllers/v1/extract";
|
|||||||
// import { readinessController } from "../controllers/v1/readiness";
|
// import { readinessController } from "../controllers/v1/readiness";
|
||||||
|
|
||||||
function checkCreditsMiddleware(
|
function checkCreditsMiddleware(
|
||||||
minimum?: number
|
minimum?: number,
|
||||||
): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
|
): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
|
||||||
return (req, res, next) => {
|
return (req, res, next) => {
|
||||||
(async () => {
|
(async () => {
|
||||||
@@ -44,20 +44,20 @@ function checkCreditsMiddleware(
|
|||||||
const { success, remainingCredits, chunk } = await checkTeamCredits(
|
const { success, remainingCredits, chunk } = await checkTeamCredits(
|
||||||
req.acuc,
|
req.acuc,
|
||||||
req.auth.team_id,
|
req.auth.team_id,
|
||||||
minimum ?? 1
|
minimum ?? 1,
|
||||||
);
|
);
|
||||||
if (chunk) {
|
if (chunk) {
|
||||||
req.acuc = chunk;
|
req.acuc = chunk;
|
||||||
}
|
}
|
||||||
if (!success) {
|
if (!success) {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`
|
`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`,
|
||||||
);
|
);
|
||||||
if (!res.headersSent) {
|
if (!res.headersSent) {
|
||||||
return res.status(402).json({
|
return res.status(402).json({
|
||||||
success: false,
|
success: false,
|
||||||
error:
|
error:
|
||||||
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value."
|
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -68,7 +68,7 @@ function checkCreditsMiddleware(
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function authMiddleware(
|
export function authMiddleware(
|
||||||
rateLimiterMode: RateLimiterMode
|
rateLimiterMode: RateLimiterMode,
|
||||||
): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||||
return (req, res, next) => {
|
return (req, res, next) => {
|
||||||
(async () => {
|
(async () => {
|
||||||
@@ -99,7 +99,7 @@ export function authMiddleware(
|
|||||||
function idempotencyMiddleware(
|
function idempotencyMiddleware(
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response,
|
res: Response,
|
||||||
next: NextFunction
|
next: NextFunction,
|
||||||
) {
|
) {
|
||||||
(async () => {
|
(async () => {
|
||||||
if (req.headers["x-idempotency-key"]) {
|
if (req.headers["x-idempotency-key"]) {
|
||||||
@@ -123,7 +123,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
|||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
success: false,
|
success: false,
|
||||||
error:
|
error:
|
||||||
"URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions."
|
"URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -131,7 +131,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function wrap(
|
export function wrap(
|
||||||
controller: (req: Request, res: Response) => Promise<any>
|
controller: (req: Request, res: Response) => Promise<any>,
|
||||||
): (req: Request, res: Response, next: NextFunction) => any {
|
): (req: Request, res: Response, next: NextFunction) => any {
|
||||||
return (req, res, next) => {
|
return (req, res, next) => {
|
||||||
controller(req, res).catch((err) => next(err));
|
controller(req, res).catch((err) => next(err));
|
||||||
@@ -147,7 +147,7 @@ v1Router.post(
|
|||||||
authMiddleware(RateLimiterMode.Scrape),
|
authMiddleware(RateLimiterMode.Scrape),
|
||||||
checkCreditsMiddleware(1),
|
checkCreditsMiddleware(1),
|
||||||
blocklistMiddleware,
|
blocklistMiddleware,
|
||||||
wrap(scrapeController)
|
wrap(scrapeController),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.post(
|
v1Router.post(
|
||||||
@@ -156,7 +156,7 @@ v1Router.post(
|
|||||||
checkCreditsMiddleware(),
|
checkCreditsMiddleware(),
|
||||||
blocklistMiddleware,
|
blocklistMiddleware,
|
||||||
idempotencyMiddleware,
|
idempotencyMiddleware,
|
||||||
wrap(crawlController)
|
wrap(crawlController),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.post(
|
v1Router.post(
|
||||||
@@ -165,7 +165,7 @@ v1Router.post(
|
|||||||
checkCreditsMiddleware(),
|
checkCreditsMiddleware(),
|
||||||
blocklistMiddleware,
|
blocklistMiddleware,
|
||||||
idempotencyMiddleware,
|
idempotencyMiddleware,
|
||||||
wrap(batchScrapeController)
|
wrap(batchScrapeController),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.post(
|
v1Router.post(
|
||||||
@@ -173,20 +173,20 @@ v1Router.post(
|
|||||||
authMiddleware(RateLimiterMode.Map),
|
authMiddleware(RateLimiterMode.Map),
|
||||||
checkCreditsMiddleware(1),
|
checkCreditsMiddleware(1),
|
||||||
blocklistMiddleware,
|
blocklistMiddleware,
|
||||||
wrap(mapController)
|
wrap(mapController),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.get(
|
v1Router.get(
|
||||||
"/crawl/:jobId",
|
"/crawl/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
wrap(crawlStatusController)
|
wrap(crawlStatusController),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.get(
|
v1Router.get(
|
||||||
"/batch/scrape/:jobId",
|
"/batch/scrape/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
// Yes, it uses the same controller as the normal crawl status controller
|
// Yes, it uses the same controller as the normal crawl status controller
|
||||||
wrap((req: any, res): any => crawlStatusController(req, res, true))
|
wrap((req: any, res): any => crawlStatusController(req, res, true)),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.get("/scrape/:jobId", wrap(scrapeStatusController));
|
v1Router.get("/scrape/:jobId", wrap(scrapeStatusController));
|
||||||
@@ -194,7 +194,7 @@ v1Router.get("/scrape/:jobId", wrap(scrapeStatusController));
|
|||||||
v1Router.get(
|
v1Router.get(
|
||||||
"/concurrency-check",
|
"/concurrency-check",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
wrap(concurrencyCheckController)
|
wrap(concurrencyCheckController),
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.ws("/crawl/:jobId", crawlStatusWSController);
|
v1Router.ws("/crawl/:jobId", crawlStatusWSController);
|
||||||
@@ -203,7 +203,7 @@ v1Router.post(
|
|||||||
"/extract",
|
"/extract",
|
||||||
authMiddleware(RateLimiterMode.Scrape),
|
authMiddleware(RateLimiterMode.Scrape),
|
||||||
checkCreditsMiddleware(1),
|
checkCreditsMiddleware(1),
|
||||||
wrap(extractController)
|
wrap(extractController),
|
||||||
);
|
);
|
||||||
|
|
||||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||||
@@ -211,7 +211,7 @@ v1Router.post(
|
|||||||
v1Router.delete(
|
v1Router.delete(
|
||||||
"/crawl/:jobId",
|
"/crawl/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
crawlCancelController
|
crawlCancelController,
|
||||||
);
|
);
|
||||||
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||||
|
|
||||||
|
|||||||
+11
-11
@@ -18,20 +18,20 @@ async function sendCrawl(result: Result): Promise<string | undefined> {
|
|||||||
{
|
{
|
||||||
url: url,
|
url: url,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
limit: 75
|
limit: 75,
|
||||||
},
|
},
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
includeHtml: true,
|
includeHtml: true,
|
||||||
replaceAllPathsWithAbsolutePaths: true,
|
replaceAllPathsWithAbsolutePaths: true,
|
||||||
waitFor: 1000
|
waitFor: 1000,
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
Authorization: `Bearer `
|
Authorization: `Bearer `,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
result.idempotency_key = idempotencyKey;
|
result.idempotency_key = idempotencyKey;
|
||||||
return response.data.jobId;
|
return response.data.jobId;
|
||||||
@@ -51,9 +51,9 @@ async function getContent(result: Result): Promise<boolean> {
|
|||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
Authorization: `Bearer `
|
Authorization: `Bearer `,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
if (response.data.status === "completed") {
|
if (response.data.status === "completed") {
|
||||||
result.result_data_jsonb = response.data.data;
|
result.result_data_jsonb = response.data.data;
|
||||||
@@ -97,11 +97,11 @@ async function processResults(results: Result[]): Promise<void> {
|
|||||||
// Save job id along with the start_url
|
// Save job id along with the start_url
|
||||||
const resultWithJobId = results.map((r) => ({
|
const resultWithJobId = results.map((r) => ({
|
||||||
start_url: r.start_url,
|
start_url: r.start_url,
|
||||||
job_id: r.job_id
|
job_id: r.job_id,
|
||||||
}));
|
}));
|
||||||
await fs.writeFile(
|
await fs.writeFile(
|
||||||
"results_with_job_id_4000_6000.json",
|
"results_with_job_id_4000_6000.json",
|
||||||
JSON.stringify(resultWithJobId, null, 4)
|
JSON.stringify(resultWithJobId, null, 4),
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error writing to results_with_content.json:", error);
|
console.error("Error writing to results_with_content.json:", error);
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ describe("WebCrawler", () => {
|
|||||||
getMatchingLineNumber: jest.fn().mockReturnValue(0),
|
getMatchingLineNumber: jest.fn().mockReturnValue(0),
|
||||||
getCrawlDelay: jest.fn().mockReturnValue(0),
|
getCrawlDelay: jest.fn().mockReturnValue(0),
|
||||||
getSitemaps: jest.fn().mockReturnValue([]),
|
getSitemaps: jest.fn().mockReturnValue([]),
|
||||||
getPreferredHost: jest.fn().mockReturnValue("example.com")
|
getPreferredHost: jest.fn().mockReturnValue("example.com"),
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -46,7 +46,7 @@ describe("WebCrawler", () => {
|
|||||||
includes: [],
|
includes: [],
|
||||||
excludes: [],
|
excludes: [],
|
||||||
limit: limit, // Apply the limit
|
limit: limit, // Apply the limit
|
||||||
maxCrawledDepth: 10
|
maxCrawledDepth: 10,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Mock sitemap fetching function to return more links than the limit
|
// Mock sitemap fetching function to return more links than the limit
|
||||||
@@ -56,7 +56,7 @@ describe("WebCrawler", () => {
|
|||||||
initialUrl,
|
initialUrl,
|
||||||
initialUrl + "/page1",
|
initialUrl + "/page1",
|
||||||
initialUrl + "/page2",
|
initialUrl + "/page2",
|
||||||
initialUrl + "/page3"
|
initialUrl + "/page3",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const filteredLinks = crawler["filterLinks"](
|
const filteredLinks = crawler["filterLinks"](
|
||||||
@@ -64,10 +64,10 @@ describe("WebCrawler", () => {
|
|||||||
initialUrl,
|
initialUrl,
|
||||||
initialUrl + "/page1",
|
initialUrl + "/page1",
|
||||||
initialUrl + "/page2",
|
initialUrl + "/page2",
|
||||||
initialUrl + "/page3"
|
initialUrl + "/page3",
|
||||||
],
|
],
|
||||||
limit,
|
limit,
|
||||||
10
|
10,
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit
|
expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ export class WebCrawler {
|
|||||||
allowBackwardCrawling = false,
|
allowBackwardCrawling = false,
|
||||||
allowExternalContentLinks = false,
|
allowExternalContentLinks = false,
|
||||||
allowSubdomains = false,
|
allowSubdomains = false,
|
||||||
ignoreRobotsTxt = false
|
ignoreRobotsTxt = false,
|
||||||
}: {
|
}: {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
@@ -79,7 +79,7 @@ export class WebCrawler {
|
|||||||
sitemapLinks: string[],
|
sitemapLinks: string[],
|
||||||
limit: number,
|
limit: number,
|
||||||
maxDepth: number,
|
maxDepth: number,
|
||||||
fromMap: boolean = false
|
fromMap: boolean = false,
|
||||||
): string[] {
|
): string[] {
|
||||||
// If the initial URL is a sitemap.xml, skip filtering
|
// If the initial URL is a sitemap.xml, skip filtering
|
||||||
if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
|
if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
|
||||||
@@ -95,7 +95,7 @@ export class WebCrawler {
|
|||||||
this.logger.debug(`Error processing link: ${link}`, {
|
this.logger.debug(`Error processing link: ${link}`, {
|
||||||
link,
|
link,
|
||||||
error,
|
error,
|
||||||
method: "filterLinks"
|
method: "filterLinks",
|
||||||
});
|
});
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -112,7 +112,7 @@ export class WebCrawler {
|
|||||||
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
if (this.excludes.length > 0 && this.excludes[0] !== "") {
|
||||||
if (
|
if (
|
||||||
this.excludes.some((excludePattern) =>
|
this.excludes.some((excludePattern) =>
|
||||||
new RegExp(excludePattern).test(path)
|
new RegExp(excludePattern).test(path),
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
return false;
|
return false;
|
||||||
@@ -123,7 +123,7 @@ export class WebCrawler {
|
|||||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||||
if (
|
if (
|
||||||
!this.includes.some((includePattern) =>
|
!this.includes.some((includePattern) =>
|
||||||
new RegExp(includePattern).test(path)
|
new RegExp(includePattern).test(path),
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
return false;
|
return false;
|
||||||
@@ -140,7 +140,7 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
const initialHostname = normalizedInitialUrl.hostname.replace(
|
const initialHostname = normalizedInitialUrl.hostname.replace(
|
||||||
/^www\./,
|
/^www\./,
|
||||||
""
|
"",
|
||||||
);
|
);
|
||||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
|
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
|
||||||
|
|
||||||
@@ -165,7 +165,7 @@ export class WebCrawler {
|
|||||||
if (!isAllowed) {
|
if (!isAllowed) {
|
||||||
this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
|
this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
|
||||||
method: "filterLinks",
|
method: "filterLinks",
|
||||||
link
|
link,
|
||||||
});
|
});
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -183,12 +183,12 @@ export class WebCrawler {
|
|||||||
let extraArgs = {};
|
let extraArgs = {};
|
||||||
if (skipTlsVerification) {
|
if (skipTlsVerification) {
|
||||||
extraArgs["httpsAgent"] = new https.Agent({
|
extraArgs["httpsAgent"] = new https.Agent({
|
||||||
rejectUnauthorized: false
|
rejectUnauthorized: false,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
const response = await axios.get(this.robotsTxtUrl, {
|
const response = await axios.get(this.robotsTxtUrl, {
|
||||||
timeout: axiosTimeout,
|
timeout: axiosTimeout,
|
||||||
...extraArgs
|
...extraArgs,
|
||||||
});
|
});
|
||||||
return response.data;
|
return response.data;
|
||||||
}
|
}
|
||||||
@@ -199,10 +199,10 @@ export class WebCrawler {
|
|||||||
|
|
||||||
public async tryGetSitemap(
|
public async tryGetSitemap(
|
||||||
fromMap: boolean = false,
|
fromMap: boolean = false,
|
||||||
onlySitemap: boolean = false
|
onlySitemap: boolean = false,
|
||||||
): Promise<{ url: string; html: string }[] | null> {
|
): Promise<{ url: string; html: string }[] | null> {
|
||||||
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
|
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
|
||||||
method: "tryGetSitemap"
|
method: "tryGetSitemap",
|
||||||
});
|
});
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
if (fromMap && onlySitemap) {
|
if (fromMap && onlySitemap) {
|
||||||
@@ -213,7 +213,7 @@ export class WebCrawler {
|
|||||||
sitemapLinks,
|
sitemapLinks,
|
||||||
this.limit,
|
this.limit,
|
||||||
this.maxCrawledDepth,
|
this.maxCrawledDepth,
|
||||||
fromMap
|
fromMap,
|
||||||
);
|
);
|
||||||
return filteredLinks.map((link) => ({ url: link, html: "" }));
|
return filteredLinks.map((link) => ({ url: link, html: "" }));
|
||||||
}
|
}
|
||||||
@@ -303,7 +303,7 @@ export class WebCrawler {
|
|||||||
|
|
||||||
private isRobotsAllowed(
|
private isRobotsAllowed(
|
||||||
url: string,
|
url: string,
|
||||||
ignoreRobotsTxt: boolean = false
|
ignoreRobotsTxt: boolean = false,
|
||||||
): boolean {
|
): boolean {
|
||||||
return ignoreRobotsTxt
|
return ignoreRobotsTxt
|
||||||
? true
|
? true
|
||||||
@@ -352,7 +352,7 @@ export class WebCrawler {
|
|||||||
url
|
url
|
||||||
.split("/")
|
.split("/")
|
||||||
.slice(3)
|
.slice(3)
|
||||||
.filter((subArray) => subArray.length > 0).length
|
.filter((subArray) => subArray.length > 0).length,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -373,7 +373,7 @@ export class WebCrawler {
|
|||||||
|
|
||||||
private isSubdomain(link: string): boolean {
|
private isSubdomain(link: string): boolean {
|
||||||
return new URL(link, this.baseUrl).hostname.endsWith(
|
return new URL(link, this.baseUrl).hostname.endsWith(
|
||||||
"." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".")
|
"." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -405,7 +405,7 @@ export class WebCrawler {
|
|||||||
".ttf",
|
".ttf",
|
||||||
".woff2",
|
".woff2",
|
||||||
".webp",
|
".webp",
|
||||||
".inc"
|
".inc",
|
||||||
];
|
];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -414,7 +414,7 @@ export class WebCrawler {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error(`Error processing URL in isFile`, {
|
this.logger.error(`Error processing URL in isFile`, {
|
||||||
method: "isFile",
|
method: "isFile",
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -431,7 +431,7 @@ export class WebCrawler {
|
|||||||
"github.com",
|
"github.com",
|
||||||
"calendly.com",
|
"calendly.com",
|
||||||
"discord.gg",
|
"discord.gg",
|
||||||
"discord.com"
|
"discord.com",
|
||||||
];
|
];
|
||||||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||||
}
|
}
|
||||||
@@ -457,14 +457,14 @@ export class WebCrawler {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(
|
this.logger.debug(
|
||||||
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
|
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
|
||||||
{ method: "tryFetchSitemapLinks", sitemapUrl, error }
|
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
|
||||||
);
|
);
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||||
// ignore 404
|
// ignore 404
|
||||||
} else {
|
} else {
|
||||||
const response = await getLinksFromSitemap(
|
const response = await getLinksFromSitemap(
|
||||||
{ sitemapUrl, mode: "fire-engine" },
|
{ sitemapUrl, mode: "fire-engine" },
|
||||||
this.logger
|
this.logger,
|
||||||
);
|
);
|
||||||
if (response) {
|
if (response) {
|
||||||
sitemapLinks = response;
|
sitemapLinks = response;
|
||||||
@@ -476,26 +476,26 @@ export class WebCrawler {
|
|||||||
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(baseUrlSitemap, {
|
const response = await axios.get(baseUrlSitemap, {
|
||||||
timeout: axiosTimeout
|
timeout: axiosTimeout,
|
||||||
});
|
});
|
||||||
if (response.status === 200) {
|
if (response.status === 200) {
|
||||||
sitemapLinks = await getLinksFromSitemap(
|
sitemapLinks = await getLinksFromSitemap(
|
||||||
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
||||||
this.logger
|
this.logger,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
||||||
method: "tryFetchSitemapLinks",
|
method: "tryFetchSitemapLinks",
|
||||||
sitemapUrl: baseUrlSitemap,
|
sitemapUrl: baseUrlSitemap,
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||||
// ignore 404
|
// ignore 404
|
||||||
} else {
|
} else {
|
||||||
sitemapLinks = await getLinksFromSitemap(
|
sitemapLinks = await getLinksFromSitemap(
|
||||||
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
||||||
this.logger
|
this.logger,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -503,7 +503,7 @@ export class WebCrawler {
|
|||||||
|
|
||||||
const normalizedUrl = normalizeUrl(url);
|
const normalizedUrl = normalizeUrl(url);
|
||||||
const normalizedSitemapLinks = sitemapLinks.map((link) =>
|
const normalizedSitemapLinks = sitemapLinks.map((link) =>
|
||||||
normalizeUrl(link)
|
normalizeUrl(link),
|
||||||
);
|
);
|
||||||
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
|
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
|
||||||
if (
|
if (
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { logger } from "../../../lib/logger";
|
|||||||
|
|
||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
url: string
|
url: string,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
scraper: string;
|
scraper: string;
|
||||||
url: string;
|
url: string;
|
||||||
@@ -15,7 +15,7 @@ export async function handleCustomScraping(
|
|||||||
!url.includes("developers.notion.com")
|
!url.includes("developers.notion.com")
|
||||||
) {
|
) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`,
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
scraper: "fire-engine",
|
scraper: "fire-engine",
|
||||||
@@ -23,21 +23,21 @@ export async function handleCustomScraping(
|
|||||||
waitAfterLoad: 1000,
|
waitAfterLoad: 1000,
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
scrollXPaths: [
|
scrollXPaths: [
|
||||||
'//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]'
|
'//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]',
|
||||||
]
|
],
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for Vanta security portals
|
// Check for Vanta security portals
|
||||||
if (text.includes('<link href="https://static.vanta.com')) {
|
if (text.includes('<link href="https://static.vanta.com')) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
|
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`,
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
scraper: "fire-engine",
|
scraper: "fire-engine",
|
||||||
url: url,
|
url: url,
|
||||||
waitAfterLoad: 3000
|
waitAfterLoad: 3000,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,7 +50,7 @@ export async function handleCustomScraping(
|
|||||||
logger.debug(`Google Drive PDF link detected: ${url}`);
|
logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||||
|
|
||||||
const fileIdMatch = url.match(
|
const fileIdMatch = url.match(
|
||||||
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/
|
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/,
|
||||||
);
|
);
|
||||||
if (fileIdMatch) {
|
if (fileIdMatch) {
|
||||||
const fileId = fileIdMatch[1];
|
const fileId = fileIdMatch[1];
|
||||||
@@ -58,7 +58,7 @@ export async function handleCustomScraping(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
scraper: "pdf",
|
scraper: "pdf",
|
||||||
url: pdfUrl
|
url: pdfUrl,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,13 +10,13 @@ export async function getLinksFromSitemap(
|
|||||||
{
|
{
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
allUrls = [],
|
allUrls = [],
|
||||||
mode = "axios"
|
mode = "axios",
|
||||||
}: {
|
}: {
|
||||||
sitemapUrl: string;
|
sitemapUrl: string;
|
||||||
allUrls?: string[];
|
allUrls?: string[];
|
||||||
mode?: "axios" | "fire-engine";
|
mode?: "axios" | "fire-engine";
|
||||||
},
|
},
|
||||||
logger: Logger
|
logger: Logger,
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
@@ -29,7 +29,7 @@ export async function getLinksFromSitemap(
|
|||||||
"sitemap",
|
"sitemap",
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }
|
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||||
);
|
);
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
throw response.error;
|
throw response.error;
|
||||||
@@ -41,7 +41,7 @@ export async function getLinksFromSitemap(
|
|||||||
method: "getLinksFromSitemap",
|
method: "getLinksFromSitemap",
|
||||||
mode,
|
mode,
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
@@ -56,8 +56,8 @@ export async function getLinksFromSitemap(
|
|||||||
.map((sitemap) =>
|
.map((sitemap) =>
|
||||||
getLinksFromSitemap(
|
getLinksFromSitemap(
|
||||||
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
|
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
|
||||||
logger
|
logger,
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
await Promise.all(sitemapPromises);
|
await Promise.all(sitemapPromises);
|
||||||
} else if (root && root.url) {
|
} else if (root && root.url) {
|
||||||
@@ -66,7 +66,7 @@ export async function getLinksFromSitemap(
|
|||||||
(url) =>
|
(url) =>
|
||||||
url.loc &&
|
url.loc &&
|
||||||
url.loc.length > 0 &&
|
url.loc.length > 0 &&
|
||||||
!WebCrawler.prototype.isFile(url.loc[0])
|
!WebCrawler.prototype.isFile(url.loc[0]),
|
||||||
)
|
)
|
||||||
.map((url) => url.loc[0]);
|
.map((url) => url.loc[0]);
|
||||||
allUrls.push(...validUrls);
|
allUrls.push(...validUrls);
|
||||||
@@ -76,7 +76,7 @@ export async function getLinksFromSitemap(
|
|||||||
method: "getLinksFromSitemap",
|
method: "getLinksFromSitemap",
|
||||||
mode,
|
mode,
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -85,12 +85,12 @@ export async function getLinksFromSitemap(
|
|||||||
|
|
||||||
export const fetchSitemapData = async (
|
export const fetchSitemapData = async (
|
||||||
url: string,
|
url: string,
|
||||||
timeout?: number
|
timeout?: number,
|
||||||
): Promise<SitemapEntry[] | null> => {
|
): Promise<SitemapEntry[] | null> => {
|
||||||
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(sitemapUrl, {
|
const response = await axios.get(sitemapUrl, {
|
||||||
timeout: timeout || axiosTimeout
|
timeout: timeout || axiosTimeout,
|
||||||
});
|
});
|
||||||
if (response.status === 200) {
|
if (response.status === 200) {
|
||||||
const xml = response.data;
|
const xml = response.data;
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ describe("Blocklist Functionality", () => {
|
|||||||
"https://flickr.com/photos/johndoe",
|
"https://flickr.com/photos/johndoe",
|
||||||
"https://whatsapp.com/download",
|
"https://whatsapp.com/download",
|
||||||
"https://wechat.com/features",
|
"https://wechat.com/features",
|
||||||
"https://telegram.org/apps"
|
"https://telegram.org/apps",
|
||||||
])("should return true for blocklisted URL %s", (url) => {
|
])("should return true for blocklisted URL %s", (url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
});
|
});
|
||||||
@@ -33,7 +33,7 @@ describe("Blocklist Functionality", () => {
|
|||||||
"https://flickr.com/help/terms",
|
"https://flickr.com/help/terms",
|
||||||
"https://whatsapp.com/legal",
|
"https://whatsapp.com/legal",
|
||||||
"https://wechat.com/en/privacy-policy",
|
"https://wechat.com/en/privacy-policy",
|
||||||
"https://telegram.org/tos"
|
"https://telegram.org/tos",
|
||||||
])("should return false for allowed URLs with keywords %s", (url) => {
|
])("should return false for allowed URLs with keywords %s", (url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
});
|
});
|
||||||
@@ -54,35 +54,35 @@ describe("Blocklist Functionality", () => {
|
|||||||
"https://facebook.com.someotherdomain.com",
|
"https://facebook.com.someotherdomain.com",
|
||||||
"https://www.facebook.com/profile",
|
"https://www.facebook.com/profile",
|
||||||
"https://api.twitter.com/info",
|
"https://api.twitter.com/info",
|
||||||
"https://instagram.com/accounts/login"
|
"https://instagram.com/accounts/login",
|
||||||
])(
|
])(
|
||||||
"should return true for URLs with blocklisted domains in subdomains or paths %s",
|
"should return true for URLs with blocklisted domains in subdomains or paths %s",
|
||||||
(url) => {
|
(url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
"https://example.com/facebook.com",
|
"https://example.com/facebook.com",
|
||||||
"https://example.com/redirect?url=https://twitter.com",
|
"https://example.com/redirect?url=https://twitter.com",
|
||||||
"https://facebook.com.policy.example.com"
|
"https://facebook.com.policy.example.com",
|
||||||
])(
|
])(
|
||||||
"should return false for URLs where blocklisted domain is part of another domain or path %s",
|
"should return false for URLs where blocklisted domain is part of another domain or path %s",
|
||||||
(url) => {
|
(url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
|
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
|
||||||
"should handle case variations %s",
|
"should handle case variations %s",
|
||||||
(url) => {
|
(url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
"https://facebook.com?redirect=https://example.com",
|
"https://facebook.com?redirect=https://example.com",
|
||||||
"https://twitter.com?query=something"
|
"https://twitter.com?query=something",
|
||||||
])("should handle query parameters %s", (url) => {
|
])("should handle query parameters %s", (url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ const socialMediaBlocklist = [
|
|||||||
"youtube.com",
|
"youtube.com",
|
||||||
"corterix.com",
|
"corterix.com",
|
||||||
"southwest.com",
|
"southwest.com",
|
||||||
"ryanair.com"
|
"ryanair.com",
|
||||||
];
|
];
|
||||||
|
|
||||||
const allowedKeywords = [
|
const allowedKeywords = [
|
||||||
@@ -41,7 +41,7 @@ const allowedKeywords = [
|
|||||||
"://library.tiktok.com",
|
"://library.tiktok.com",
|
||||||
"://ads.tiktok.com",
|
"://ads.tiktok.com",
|
||||||
"://tiktok.com/business",
|
"://tiktok.com/business",
|
||||||
"://developers.facebook.com"
|
"://developers.facebook.com",
|
||||||
];
|
];
|
||||||
|
|
||||||
export function isUrlBlocked(url: string): boolean {
|
export function isUrlBlocked(url: string): boolean {
|
||||||
@@ -50,7 +50,7 @@ export function isUrlBlocked(url: string): boolean {
|
|||||||
// Check if the URL contains any allowed keywords as whole words
|
// Check if the URL contains any allowed keywords as whole words
|
||||||
if (
|
if (
|
||||||
allowedKeywords.some((keyword) =>
|
allowedKeywords.some((keyword) =>
|
||||||
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl)
|
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl),
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
return false;
|
return false;
|
||||||
@@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
|
|||||||
const isBlocked = socialMediaBlocklist.some((domain) => {
|
const isBlocked = socialMediaBlocklist.some((domain) => {
|
||||||
const domainPattern = new RegExp(
|
const domainPattern = new RegExp(
|
||||||
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
|
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
|
||||||
"i"
|
"i",
|
||||||
);
|
);
|
||||||
return domainPattern.test(hostname);
|
return domainPattern.test(hostname);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
export function getAdjustedMaxDepth(
|
export function getAdjustedMaxDepth(
|
||||||
url: string,
|
url: string,
|
||||||
maxCrawlDepth: number
|
maxCrawlDepth: number,
|
||||||
): number {
|
): number {
|
||||||
const baseURLDepth = getURLDepth(url);
|
const baseURLDepth = getURLDepth(url);
|
||||||
const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;
|
const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;
|
||||||
|
|||||||
+1
-1
@@ -14,6 +14,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
url: entry.url,
|
url: entry.url,
|
||||||
html: entry.html,
|
html: entry.html,
|
||||||
statusCode: entry.statusCode,
|
statusCode: entry.statusCode,
|
||||||
error: entry.error
|
error: entry.error,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,6 @@ export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
url: response.url,
|
url: response.url,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
|
||||||
html: (await mammoth.convertToHtml({ path: tempFilePath })).value
|
html: (await mammoth.convertToHtml({ path: tempFilePath })).value,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,33 +4,33 @@ import { TimeoutError } from "../../error";
|
|||||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
|
|
||||||
export async function scrapeURLWithFetch(
|
export async function scrapeURLWithFetch(
|
||||||
meta: Meta
|
meta: Meta,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const timeout = 20000;
|
const timeout = 20000;
|
||||||
|
|
||||||
const response = await Promise.race([
|
const response = await Promise.race([
|
||||||
fetch(meta.url, {
|
fetch(meta.url, {
|
||||||
redirect: "follow",
|
redirect: "follow",
|
||||||
headers: meta.options.headers
|
headers: meta.options.headers,
|
||||||
}),
|
}),
|
||||||
(async () => {
|
(async () => {
|
||||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||||
throw new TimeoutError(
|
throw new TimeoutError(
|
||||||
"Fetch was unable to scrape the page before timing out",
|
"Fetch was unable to scrape the page before timing out",
|
||||||
{ cause: { timeout } }
|
{ cause: { timeout } },
|
||||||
);
|
);
|
||||||
})()
|
})(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
specialtyScrapeCheck(
|
||||||
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
||||||
Object.fromEntries(response.headers as any)
|
Object.fromEntries(response.headers as any),
|
||||||
);
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: response.url,
|
url: response.url,
|
||||||
html: await response.text(),
|
html: await response.text(),
|
||||||
statusCode: response.status
|
statusCode: response.status,
|
||||||
// TODO: error?
|
// TODO: error?
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,10 +31,10 @@ const successSchema = z.object({
|
|||||||
actionContent: z
|
actionContent: z
|
||||||
.object({
|
.object({
|
||||||
url: z.string(),
|
url: z.string(),
|
||||||
html: z.string()
|
html: z.string(),
|
||||||
})
|
})
|
||||||
.array()
|
.array()
|
||||||
.optional()
|
.optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
||||||
@@ -47,16 +47,16 @@ const processingSchema = z.object({
|
|||||||
"waiting",
|
"waiting",
|
||||||
"waiting-children",
|
"waiting-children",
|
||||||
"unknown",
|
"unknown",
|
||||||
"prioritized"
|
"prioritized",
|
||||||
]),
|
]),
|
||||||
processing: z.boolean()
|
processing: z.boolean(),
|
||||||
});
|
});
|
||||||
|
|
||||||
const failedSchema = z.object({
|
const failedSchema = z.object({
|
||||||
jobId: z.string(),
|
jobId: z.string(),
|
||||||
state: z.literal("failed"),
|
state: z.literal("failed"),
|
||||||
processing: z.literal(false),
|
processing: z.literal(false),
|
||||||
error: z.string()
|
error: z.string(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export class StillProcessingError extends Error {
|
export class StillProcessingError extends Error {
|
||||||
@@ -67,7 +67,7 @@ export class StillProcessingError extends Error {
|
|||||||
|
|
||||||
export async function fireEngineCheckStatus(
|
export async function fireEngineCheckStatus(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
jobId: string
|
jobId: string,
|
||||||
): Promise<FireEngineCheckStatusSuccess> {
|
): Promise<FireEngineCheckStatusSuccess> {
|
||||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||||
|
|
||||||
@@ -75,8 +75,8 @@ export async function fireEngineCheckStatus(
|
|||||||
{
|
{
|
||||||
name: "fire-engine: Check status",
|
name: "fire-engine: Check status",
|
||||||
attributes: {
|
attributes: {
|
||||||
jobId
|
jobId,
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
@@ -87,12 +87,12 @@ export async function fireEngineCheckStatus(
|
|||||||
...(Sentry.isInitialized()
|
...(Sentry.isInitialized()
|
||||||
? {
|
? {
|
||||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||||
baggage: Sentry.spanToBaggageHeader(span)
|
baggage: Sentry.spanToBaggageHeader(span),
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
const successParse = successSchema.safeParse(status);
|
const successParse = successSchema.safeParse(status);
|
||||||
@@ -115,23 +115,23 @@ export async function fireEngineCheckStatus(
|
|||||||
throw new EngineError("Scrape job failed", {
|
throw new EngineError("Scrape job failed", {
|
||||||
cause: {
|
cause: {
|
||||||
status,
|
status,
|
||||||
jobId
|
jobId,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Check status returned response not matched by any schema", {
|
logger.debug("Check status returned response not matched by any schema", {
|
||||||
status,
|
status,
|
||||||
jobId
|
jobId,
|
||||||
});
|
});
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"Check status returned response not matched by any schema",
|
"Check status returned response not matched by any schema",
|
||||||
{
|
{
|
||||||
cause: {
|
cause: {
|
||||||
status,
|
status,
|
||||||
jobId
|
jobId,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,8 +10,8 @@ export async function fireEngineDelete(logger: Logger, jobId: string) {
|
|||||||
{
|
{
|
||||||
name: "fire-engine: Delete scrape",
|
name: "fire-engine: Delete scrape",
|
||||||
attributes: {
|
attributes: {
|
||||||
jobId
|
jobId,
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
await robustFetch({
|
await robustFetch({
|
||||||
@@ -21,15 +21,15 @@ export async function fireEngineDelete(logger: Logger, jobId: string) {
|
|||||||
...(Sentry.isInitialized()
|
...(Sentry.isInitialized()
|
||||||
? {
|
? {
|
||||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||||
baggage: Sentry.spanToBaggageHeader(span)
|
baggage: Sentry.spanToBaggageHeader(span),
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
},
|
},
|
||||||
ignoreResponse: true,
|
ignoreResponse: true,
|
||||||
ignoreFailure: true,
|
ignoreFailure: true,
|
||||||
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId })
|
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
|
||||||
});
|
});
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
// We do not care whether this fails or not.
|
// We do not care whether this fails or not.
|
||||||
|
|||||||
@@ -5,13 +5,13 @@ import {
|
|||||||
FireEngineScrapeRequestChromeCDP,
|
FireEngineScrapeRequestChromeCDP,
|
||||||
FireEngineScrapeRequestCommon,
|
FireEngineScrapeRequestCommon,
|
||||||
FireEngineScrapeRequestPlaywright,
|
FireEngineScrapeRequestPlaywright,
|
||||||
FireEngineScrapeRequestTLSClient
|
FireEngineScrapeRequestTLSClient,
|
||||||
} from "./scrape";
|
} from "./scrape";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import {
|
import {
|
||||||
fireEngineCheckStatus,
|
fireEngineCheckStatus,
|
||||||
FireEngineCheckStatusSuccess,
|
FireEngineCheckStatusSuccess,
|
||||||
StillProcessingError
|
StillProcessingError,
|
||||||
} from "./checkStatus";
|
} from "./checkStatus";
|
||||||
import { EngineError, SiteError, TimeoutError } from "../../error";
|
import { EngineError, SiteError, TimeoutError } from "../../error";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
@@ -27,15 +27,15 @@ async function performFireEngineScrape<
|
|||||||
Engine extends
|
Engine extends
|
||||||
| FireEngineScrapeRequestChromeCDP
|
| FireEngineScrapeRequestChromeCDP
|
||||||
| FireEngineScrapeRequestPlaywright
|
| FireEngineScrapeRequestPlaywright
|
||||||
| FireEngineScrapeRequestTLSClient
|
| FireEngineScrapeRequestTLSClient,
|
||||||
>(
|
>(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
request: FireEngineScrapeRequestCommon & Engine,
|
request: FireEngineScrapeRequestCommon & Engine,
|
||||||
timeout = defaultTimeout
|
timeout = defaultTimeout,
|
||||||
): Promise<FireEngineCheckStatusSuccess> {
|
): Promise<FireEngineCheckStatusSuccess> {
|
||||||
const scrape = await fireEngineScrape(
|
const scrape = await fireEngineScrape(
|
||||||
logger.child({ method: "fireEngineScrape" }),
|
logger.child({ method: "fireEngineScrape" }),
|
||||||
request
|
request,
|
||||||
);
|
);
|
||||||
|
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
@@ -47,25 +47,25 @@ async function performFireEngineScrape<
|
|||||||
if (errors.length >= errorLimit) {
|
if (errors.length >= errorLimit) {
|
||||||
logger.error("Error limit hit.", { errors });
|
logger.error("Error limit hit.", { errors });
|
||||||
throw new Error("Error limit hit. See e.cause.errors for errors.", {
|
throw new Error("Error limit hit. See e.cause.errors for errors.", {
|
||||||
cause: { errors }
|
cause: { errors },
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Date.now() - startTime > timeout) {
|
if (Date.now() - startTime > timeout) {
|
||||||
logger.info(
|
logger.info(
|
||||||
"Fire-engine was unable to scrape the page before timing out.",
|
"Fire-engine was unable to scrape the page before timing out.",
|
||||||
{ errors, timeout }
|
{ errors, timeout },
|
||||||
);
|
);
|
||||||
throw new TimeoutError(
|
throw new TimeoutError(
|
||||||
"Fire-engine was unable to scrape the page before timing out",
|
"Fire-engine was unable to scrape the page before timing out",
|
||||||
{ cause: { errors, timeout } }
|
{ cause: { errors, timeout } },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
status = await fireEngineCheckStatus(
|
status = await fireEngineCheckStatus(
|
||||||
logger.child({ method: "fireEngineCheckStatus" }),
|
logger.child({ method: "fireEngineCheckStatus" }),
|
||||||
scrape.jobId
|
scrape.jobId,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof StillProcessingError) {
|
if (error instanceof StillProcessingError) {
|
||||||
@@ -73,7 +73,7 @@ async function performFireEngineScrape<
|
|||||||
} else if (error instanceof EngineError || error instanceof SiteError) {
|
} else if (error instanceof EngineError || error instanceof SiteError) {
|
||||||
logger.debug("Fire-engine scrape job failed.", {
|
logger.debug("Fire-engine scrape job failed.", {
|
||||||
error,
|
error,
|
||||||
jobId: scrape.jobId
|
jobId: scrape.jobId,
|
||||||
});
|
});
|
||||||
throw error;
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
@@ -81,7 +81,7 @@ async function performFireEngineScrape<
|
|||||||
errors.push(error);
|
errors.push(error);
|
||||||
logger.debug(
|
logger.debug(
|
||||||
`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`,
|
`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`,
|
||||||
{ error, jobId: scrape.jobId }
|
{ error, jobId: scrape.jobId },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -93,7 +93,7 @@ async function performFireEngineScrape<
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithFireEngineChromeCDP(
|
export async function scrapeURLWithFireEngineChromeCDP(
|
||||||
meta: Meta
|
meta: Meta,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const actions: Action[] = [
|
const actions: Action[] = [
|
||||||
// Transform waitFor option into an action (unsupported by chrome-cdp)
|
// Transform waitFor option into an action (unsupported by chrome-cdp)
|
||||||
@@ -101,8 +101,8 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
? [
|
? [
|
||||||
{
|
{
|
||||||
type: "wait" as const,
|
type: "wait" as const,
|
||||||
milliseconds: meta.options.waitFor
|
milliseconds: meta.options.waitFor,
|
||||||
}
|
},
|
||||||
]
|
]
|
||||||
: []),
|
: []),
|
||||||
|
|
||||||
@@ -112,13 +112,13 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
? [
|
? [
|
||||||
{
|
{
|
||||||
type: "screenshot" as const,
|
type: "screenshot" as const,
|
||||||
fullPage: meta.options.formats.includes("screenshot@fullPage")
|
fullPage: meta.options.formats.includes("screenshot@fullPage"),
|
||||||
}
|
},
|
||||||
]
|
]
|
||||||
: []),
|
: []),
|
||||||
|
|
||||||
// Include specified actions
|
// Include specified actions
|
||||||
...(meta.options.actions ?? [])
|
...(meta.options.actions ?? []),
|
||||||
];
|
];
|
||||||
|
|
||||||
const request: FireEngineScrapeRequestCommon &
|
const request: FireEngineScrapeRequestCommon &
|
||||||
@@ -130,36 +130,36 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
headers: meta.options.headers,
|
headers: meta.options.headers,
|
||||||
...(actions.length > 0
|
...(actions.length > 0
|
||||||
? {
|
? {
|
||||||
actions
|
actions,
|
||||||
}
|
}
|
||||||
: {}),
|
: {}),
|
||||||
priority: meta.internalOptions.priority,
|
priority: meta.internalOptions.priority,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation,
|
||||||
mobile: meta.options.mobile,
|
mobile: meta.options.mobile,
|
||||||
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
||||||
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache
|
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
||||||
// TODO: scrollXPaths
|
// TODO: scrollXPaths
|
||||||
};
|
};
|
||||||
|
|
||||||
const totalWait = actions.reduce(
|
const totalWait = actions.reduce(
|
||||||
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
|
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
|
|
||||||
let response = await performFireEngineScrape(
|
let response = await performFireEngineScrape(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
||||||
request
|
request,
|
||||||
}),
|
}),
|
||||||
request,
|
request,
|
||||||
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity // TODO: better timeout handling
|
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity, // TODO: better timeout handling
|
||||||
);
|
);
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
specialtyScrapeCheck(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck"
|
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
|
||||||
}),
|
}),
|
||||||
response.responseHeaders
|
response.responseHeaders,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -168,20 +168,20 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
) {
|
) {
|
||||||
meta.logger.debug(
|
meta.logger.debug(
|
||||||
"Transforming screenshots from actions into screenshot field",
|
"Transforming screenshots from actions into screenshot field",
|
||||||
{ screenshots: response.screenshots }
|
{ screenshots: response.screenshots },
|
||||||
);
|
);
|
||||||
response.screenshot = (response.screenshots ?? [])[0];
|
response.screenshot = (response.screenshots ?? [])[0];
|
||||||
(response.screenshots ?? []).splice(0, 1);
|
(response.screenshots ?? []).splice(0, 1);
|
||||||
meta.logger.debug("Screenshot transformation done", {
|
meta.logger.debug("Screenshot transformation done", {
|
||||||
screenshots: response.screenshots,
|
screenshots: response.screenshots,
|
||||||
screenshot: response.screenshot
|
screenshot: response.screenshot,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!response.url) {
|
if (!response.url) {
|
||||||
meta.logger.warn("Fire-engine did not return the response's URL", {
|
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||||
response,
|
response,
|
||||||
sourceURL: meta.url
|
sourceURL: meta.url,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,15 +197,15 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
? {
|
? {
|
||||||
actions: {
|
actions: {
|
||||||
screenshots: response.screenshots ?? [],
|
screenshots: response.screenshots ?? [],
|
||||||
scrapes: response.actionContent ?? []
|
scrapes: response.actionContent ?? [],
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithFireEnginePlaywright(
|
export async function scrapeURLWithFireEnginePlaywright(
|
||||||
meta: Meta
|
meta: Meta,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const request: FireEngineScrapeRequestCommon &
|
const request: FireEngineScrapeRequestCommon &
|
||||||
FireEngineScrapeRequestPlaywright = {
|
FireEngineScrapeRequestPlaywright = {
|
||||||
@@ -220,31 +220,31 @@ export async function scrapeURLWithFireEnginePlaywright(
|
|||||||
wait: meta.options.waitFor,
|
wait: meta.options.waitFor,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation,
|
||||||
|
|
||||||
timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic
|
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = await performFireEngineScrape(
|
let response = await performFireEngineScrape(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
||||||
request
|
request,
|
||||||
}),
|
}),
|
||||||
request,
|
request,
|
||||||
meta.options.timeout !== undefined
|
meta.options.timeout !== undefined
|
||||||
? defaultTimeout + meta.options.waitFor
|
? defaultTimeout + meta.options.waitFor
|
||||||
: Infinity // TODO: better timeout handling
|
: Infinity, // TODO: better timeout handling
|
||||||
);
|
);
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
specialtyScrapeCheck(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck"
|
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
|
||||||
}),
|
}),
|
||||||
response.responseHeaders
|
response.responseHeaders,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!response.url) {
|
if (!response.url) {
|
||||||
meta.logger.warn("Fire-engine did not return the response's URL", {
|
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||||
response,
|
response,
|
||||||
sourceURL: meta.url
|
sourceURL: meta.url,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -257,14 +257,14 @@ export async function scrapeURLWithFireEnginePlaywright(
|
|||||||
|
|
||||||
...(response.screenshots !== undefined && response.screenshots.length > 0
|
...(response.screenshots !== undefined && response.screenshots.length > 0
|
||||||
? {
|
? {
|
||||||
screenshot: response.screenshots[0]
|
screenshot: response.screenshots[0],
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithFireEngineTLSClient(
|
export async function scrapeURLWithFireEngineTLSClient(
|
||||||
meta: Meta
|
meta: Meta,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const request: FireEngineScrapeRequestCommon &
|
const request: FireEngineScrapeRequestCommon &
|
||||||
FireEngineScrapeRequestTLSClient = {
|
FireEngineScrapeRequestTLSClient = {
|
||||||
@@ -279,29 +279,29 @@ export async function scrapeURLWithFireEngineTLSClient(
|
|||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation,
|
||||||
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
||||||
|
|
||||||
timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic
|
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = await performFireEngineScrape(
|
let response = await performFireEngineScrape(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
||||||
request
|
request,
|
||||||
}),
|
}),
|
||||||
request,
|
request,
|
||||||
meta.options.timeout !== undefined ? defaultTimeout : Infinity // TODO: better timeout handling
|
meta.options.timeout !== undefined ? defaultTimeout : Infinity, // TODO: better timeout handling
|
||||||
);
|
);
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
specialtyScrapeCheck(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck"
|
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
|
||||||
}),
|
}),
|
||||||
response.responseHeaders
|
response.responseHeaders,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!response.url) {
|
if (!response.url) {
|
||||||
meta.logger.warn("Fire-engine did not return the response's URL", {
|
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||||
response,
|
response,
|
||||||
sourceURL: meta.url
|
sourceURL: meta.url,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -310,6 +310,6 @@ export async function scrapeURLWithFireEngineTLSClient(
|
|||||||
|
|
||||||
html: response.content,
|
html: response.content,
|
||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
statusCode: response.pageStatusCode
|
statusCode: response.pageStatusCode,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,17 +58,17 @@ export type FireEngineScrapeRequestTLSClient = {
|
|||||||
|
|
||||||
const schema = z.object({
|
const schema = z.object({
|
||||||
jobId: z.string(),
|
jobId: z.string(),
|
||||||
processing: z.boolean()
|
processing: z.boolean(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export async function fireEngineScrape<
|
export async function fireEngineScrape<
|
||||||
Engine extends
|
Engine extends
|
||||||
| FireEngineScrapeRequestChromeCDP
|
| FireEngineScrapeRequestChromeCDP
|
||||||
| FireEngineScrapeRequestPlaywright
|
| FireEngineScrapeRequestPlaywright
|
||||||
| FireEngineScrapeRequestTLSClient
|
| FireEngineScrapeRequestTLSClient,
|
||||||
>(
|
>(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
request: FireEngineScrapeRequestCommon & Engine
|
request: FireEngineScrapeRequestCommon & Engine,
|
||||||
): Promise<z.infer<typeof schema>> {
|
): Promise<z.infer<typeof schema>> {
|
||||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||||
|
|
||||||
@@ -78,8 +78,8 @@ export async function fireEngineScrape<
|
|||||||
{
|
{
|
||||||
name: "fire-engine: Scrape",
|
name: "fire-engine: Scrape",
|
||||||
attributes: {
|
attributes: {
|
||||||
url: request.url
|
url: request.url,
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
@@ -89,16 +89,16 @@ export async function fireEngineScrape<
|
|||||||
...(Sentry.isInitialized()
|
...(Sentry.isInitialized()
|
||||||
? {
|
? {
|
||||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||||
baggage: Sentry.spanToBaggageHeader(span)
|
baggage: Sentry.spanToBaggageHeader(span),
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
},
|
},
|
||||||
body: request,
|
body: request,
|
||||||
logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
|
logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
|
||||||
schema,
|
schema,
|
||||||
tryCount: 3
|
tryCount: 3,
|
||||||
});
|
});
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
return scrapeRequest;
|
return scrapeRequest;
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import { scrapeDOCX } from "./docx";
|
|||||||
import {
|
import {
|
||||||
scrapeURLWithFireEngineChromeCDP,
|
scrapeURLWithFireEngineChromeCDP,
|
||||||
scrapeURLWithFireEnginePlaywright,
|
scrapeURLWithFireEnginePlaywright,
|
||||||
scrapeURLWithFireEngineTLSClient
|
scrapeURLWithFireEngineTLSClient,
|
||||||
} from "./fire-engine";
|
} from "./fire-engine";
|
||||||
import { scrapePDF } from "./pdf";
|
import { scrapePDF } from "./pdf";
|
||||||
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
||||||
@@ -43,7 +43,7 @@ export const engines: Engine[] = [
|
|||||||
? [
|
? [
|
||||||
"fire-engine;chrome-cdp" as const,
|
"fire-engine;chrome-cdp" as const,
|
||||||
"fire-engine;playwright" as const,
|
"fire-engine;playwright" as const,
|
||||||
"fire-engine;tlsclient" as const
|
"fire-engine;tlsclient" as const,
|
||||||
]
|
]
|
||||||
: []),
|
: []),
|
||||||
...(useScrapingBee
|
...(useScrapingBee
|
||||||
@@ -52,7 +52,7 @@ export const engines: Engine[] = [
|
|||||||
...(usePlaywright ? ["playwright" as const] : []),
|
...(usePlaywright ? ["playwright" as const] : []),
|
||||||
"fetch",
|
"fetch",
|
||||||
"pdf",
|
"pdf",
|
||||||
"docx"
|
"docx",
|
||||||
];
|
];
|
||||||
|
|
||||||
export const featureFlags = [
|
export const featureFlags = [
|
||||||
@@ -66,7 +66,7 @@ export const featureFlags = [
|
|||||||
"location",
|
"location",
|
||||||
"mobile",
|
"mobile",
|
||||||
"skipTlsVerification",
|
"skipTlsVerification",
|
||||||
"useFastMode"
|
"useFastMode",
|
||||||
] as const;
|
] as const;
|
||||||
|
|
||||||
export type FeatureFlag = (typeof featureFlags)[number];
|
export type FeatureFlag = (typeof featureFlags)[number];
|
||||||
@@ -86,7 +86,7 @@ export const featureFlagOptions: {
|
|||||||
useFastMode: { priority: 90 },
|
useFastMode: { priority: 90 },
|
||||||
location: { priority: 10 },
|
location: { priority: 10 },
|
||||||
mobile: { priority: 10 },
|
mobile: { priority: 10 },
|
||||||
skipTlsVerification: { priority: 10 }
|
skipTlsVerification: { priority: 10 },
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
export type EngineScrapeResult = {
|
export type EngineScrapeResult = {
|
||||||
@@ -116,7 +116,7 @@ const engineHandlers: {
|
|||||||
playwright: scrapeURLWithPlaywright,
|
playwright: scrapeURLWithPlaywright,
|
||||||
fetch: scrapeURLWithFetch,
|
fetch: scrapeURLWithFetch,
|
||||||
pdf: scrapePDF,
|
pdf: scrapePDF,
|
||||||
docx: scrapeDOCX
|
docx: scrapeDOCX,
|
||||||
};
|
};
|
||||||
|
|
||||||
export const engineOptions: {
|
export const engineOptions: {
|
||||||
@@ -141,9 +141,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: false
|
useFastMode: false,
|
||||||
},
|
},
|
||||||
quality: 1000 // cache should always be tried first
|
quality: 1000, // cache should always be tried first
|
||||||
},
|
},
|
||||||
"fire-engine;chrome-cdp": {
|
"fire-engine;chrome-cdp": {
|
||||||
features: {
|
features: {
|
||||||
@@ -157,9 +157,9 @@ export const engineOptions: {
|
|||||||
location: true,
|
location: true,
|
||||||
mobile: true,
|
mobile: true,
|
||||||
skipTlsVerification: true,
|
skipTlsVerification: true,
|
||||||
useFastMode: false
|
useFastMode: false,
|
||||||
},
|
},
|
||||||
quality: 50
|
quality: 50,
|
||||||
},
|
},
|
||||||
"fire-engine;playwright": {
|
"fire-engine;playwright": {
|
||||||
features: {
|
features: {
|
||||||
@@ -173,9 +173,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: false
|
useFastMode: false,
|
||||||
},
|
},
|
||||||
quality: 40
|
quality: 40,
|
||||||
},
|
},
|
||||||
scrapingbee: {
|
scrapingbee: {
|
||||||
features: {
|
features: {
|
||||||
@@ -189,9 +189,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: false
|
useFastMode: false,
|
||||||
},
|
},
|
||||||
quality: 30
|
quality: 30,
|
||||||
},
|
},
|
||||||
scrapingbeeLoad: {
|
scrapingbeeLoad: {
|
||||||
features: {
|
features: {
|
||||||
@@ -205,9 +205,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: false
|
useFastMode: false,
|
||||||
},
|
},
|
||||||
quality: 29
|
quality: 29,
|
||||||
},
|
},
|
||||||
playwright: {
|
playwright: {
|
||||||
features: {
|
features: {
|
||||||
@@ -221,9 +221,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: false
|
useFastMode: false,
|
||||||
},
|
},
|
||||||
quality: 20
|
quality: 20,
|
||||||
},
|
},
|
||||||
"fire-engine;tlsclient": {
|
"fire-engine;tlsclient": {
|
||||||
features: {
|
features: {
|
||||||
@@ -237,9 +237,9 @@ export const engineOptions: {
|
|||||||
location: true,
|
location: true,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: true
|
useFastMode: true,
|
||||||
},
|
},
|
||||||
quality: 10
|
quality: 10,
|
||||||
},
|
},
|
||||||
fetch: {
|
fetch: {
|
||||||
features: {
|
features: {
|
||||||
@@ -253,9 +253,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: true
|
useFastMode: true,
|
||||||
},
|
},
|
||||||
quality: 5
|
quality: 5,
|
||||||
},
|
},
|
||||||
pdf: {
|
pdf: {
|
||||||
features: {
|
features: {
|
||||||
@@ -269,9 +269,9 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: true
|
useFastMode: true,
|
||||||
},
|
},
|
||||||
quality: -10
|
quality: -10,
|
||||||
},
|
},
|
||||||
docx: {
|
docx: {
|
||||||
features: {
|
features: {
|
||||||
@@ -285,10 +285,10 @@ export const engineOptions: {
|
|||||||
location: false,
|
location: false,
|
||||||
mobile: false,
|
mobile: false,
|
||||||
skipTlsVerification: false,
|
skipTlsVerification: false,
|
||||||
useFastMode: true
|
useFastMode: true,
|
||||||
},
|
},
|
||||||
quality: -10
|
quality: -10,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
export function buildFallbackList(meta: Meta): {
|
export function buildFallbackList(meta: Meta): {
|
||||||
@@ -297,7 +297,7 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
}[] {
|
}[] {
|
||||||
const prioritySum = [...meta.featureFlags].reduce(
|
const prioritySum = [...meta.featureFlags].reduce(
|
||||||
(a, x) => a + featureFlagOptions[x].priority,
|
(a, x) => a + featureFlagOptions[x].priority,
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
const priorityThreshold = Math.floor(prioritySum / 2);
|
const priorityThreshold = Math.floor(prioritySum / 2);
|
||||||
let selectedEngines: {
|
let selectedEngines: {
|
||||||
@@ -315,13 +315,13 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
const supportedFlags = new Set([
|
const supportedFlags = new Set([
|
||||||
...Object.entries(engineOptions[engine].features)
|
...Object.entries(engineOptions[engine].features)
|
||||||
.filter(
|
.filter(
|
||||||
([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true
|
([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true,
|
||||||
)
|
)
|
||||||
.map(([k, _]) => k)
|
.map(([k, _]) => k),
|
||||||
]);
|
]);
|
||||||
const supportScore = [...supportedFlags].reduce(
|
const supportScore = [...supportedFlags].reduce(
|
||||||
(a, x) => a + featureFlagOptions[x].priority,
|
(a, x) => a + featureFlagOptions[x].priority,
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
|
|
||||||
const unsupportedFeatures = new Set([...meta.featureFlags]);
|
const unsupportedFeatures = new Set([...meta.featureFlags]);
|
||||||
@@ -338,7 +338,7 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
prioritySum,
|
prioritySum,
|
||||||
priorityThreshold,
|
priorityThreshold,
|
||||||
featureFlags: [...meta.featureFlags],
|
featureFlags: [...meta.featureFlags],
|
||||||
unsupportedFeatures
|
unsupportedFeatures,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
meta.logger.debug(
|
meta.logger.debug(
|
||||||
@@ -348,22 +348,22 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
prioritySum,
|
prioritySum,
|
||||||
priorityThreshold,
|
priorityThreshold,
|
||||||
featureFlags: [...meta.featureFlags],
|
featureFlags: [...meta.featureFlags],
|
||||||
unsupportedFeatures
|
unsupportedFeatures,
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) {
|
if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) {
|
||||||
selectedEngines = selectedEngines.filter(
|
selectedEngines = selectedEngines.filter(
|
||||||
(x) => engineOptions[x.engine].quality > 0
|
(x) => engineOptions[x.engine].quality > 0,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
selectedEngines.sort(
|
selectedEngines.sort(
|
||||||
(a, b) =>
|
(a, b) =>
|
||||||
b.supportScore - a.supportScore ||
|
b.supportScore - a.supportScore ||
|
||||||
engineOptions[b.engine].quality - engineOptions[a.engine].quality
|
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
|
||||||
);
|
);
|
||||||
|
|
||||||
return selectedEngines;
|
return selectedEngines;
|
||||||
@@ -371,16 +371,16 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
|
|
||||||
export async function scrapeURLWithEngine(
|
export async function scrapeURLWithEngine(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
engine: Engine
|
engine: Engine,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const fn = engineHandlers[engine];
|
const fn = engineHandlers[engine];
|
||||||
const logger = meta.logger.child({
|
const logger = meta.logger.child({
|
||||||
method: fn.name ?? "scrapeURLWithEngine",
|
method: fn.name ?? "scrapeURLWithEngine",
|
||||||
engine
|
engine,
|
||||||
});
|
});
|
||||||
const _meta = {
|
const _meta = {
|
||||||
...meta,
|
...meta,
|
||||||
logger
|
logger,
|
||||||
};
|
};
|
||||||
|
|
||||||
return await fn(_meta);
|
return await fn(_meta);
|
||||||
|
|||||||
@@ -14,10 +14,10 @@ type PDFProcessorResult = { html: string; markdown?: string };
|
|||||||
|
|
||||||
async function scrapePDFWithLlamaParse(
|
async function scrapePDFWithLlamaParse(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
tempFilePath: string
|
tempFilePath: string,
|
||||||
): Promise<PDFProcessorResult> {
|
): Promise<PDFProcessorResult> {
|
||||||
meta.logger.debug("Processing PDF document with LlamaIndex", {
|
meta.logger.debug("Processing PDF document with LlamaIndex", {
|
||||||
tempFilePath
|
tempFilePath,
|
||||||
});
|
});
|
||||||
|
|
||||||
const uploadForm = new FormData();
|
const uploadForm = new FormData();
|
||||||
@@ -28,7 +28,7 @@ async function scrapePDFWithLlamaParse(
|
|||||||
name: tempFilePath,
|
name: tempFilePath,
|
||||||
stream() {
|
stream() {
|
||||||
return createReadStream(
|
return createReadStream(
|
||||||
tempFilePath
|
tempFilePath,
|
||||||
) as unknown as ReadableStream<Uint8Array>;
|
) as unknown as ReadableStream<Uint8Array>;
|
||||||
},
|
},
|
||||||
arrayBuffer() {
|
arrayBuffer() {
|
||||||
@@ -41,22 +41,22 @@ async function scrapePDFWithLlamaParse(
|
|||||||
slice(start, end, contentType) {
|
slice(start, end, contentType) {
|
||||||
throw Error("Unimplemented in mock Blob: slice");
|
throw Error("Unimplemented in mock Blob: slice");
|
||||||
},
|
},
|
||||||
type: "application/pdf"
|
type: "application/pdf",
|
||||||
} as Blob);
|
} as Blob);
|
||||||
|
|
||||||
const upload = await robustFetch({
|
const upload = await robustFetch({
|
||||||
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`
|
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
|
||||||
},
|
},
|
||||||
body: uploadForm,
|
body: uploadForm,
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
method: "scrapePDFWithLlamaParse/upload/robustFetch"
|
method: "scrapePDFWithLlamaParse/upload/robustFetch",
|
||||||
}),
|
}),
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
id: z.string()
|
id: z.string(),
|
||||||
})
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
const jobId = upload.id;
|
const jobId = upload.id;
|
||||||
@@ -70,18 +70,18 @@ async function scrapePDFWithLlamaParse(
|
|||||||
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
|
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
|
||||||
method: "GET",
|
method: "GET",
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`
|
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
|
||||||
},
|
},
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
method: "scrapePDFWithLlamaParse/result/robustFetch"
|
method: "scrapePDFWithLlamaParse/result/robustFetch",
|
||||||
}),
|
}),
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
markdown: z.string()
|
markdown: z.string(),
|
||||||
})
|
}),
|
||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
markdown: result.markdown,
|
markdown: result.markdown,
|
||||||
html: await marked.parse(result.markdown, { async: true })
|
html: await marked.parse(result.markdown, { async: true }),
|
||||||
};
|
};
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (e instanceof Error && e.message === "Request sent failure status") {
|
if (e instanceof Error && e.message === "Request sent failure status") {
|
||||||
@@ -93,7 +93,7 @@ async function scrapePDFWithLlamaParse(
|
|||||||
throw new RemoveFeatureError(["pdf"]);
|
throw new RemoveFeatureError(["pdf"]);
|
||||||
} else {
|
} else {
|
||||||
throw new Error("LlamaParse threw an error", {
|
throw new Error("LlamaParse threw an error", {
|
||||||
cause: e.cause
|
cause: e.cause,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -109,7 +109,7 @@ async function scrapePDFWithLlamaParse(
|
|||||||
|
|
||||||
async function scrapePDFWithParsePDF(
|
async function scrapePDFWithParsePDF(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
tempFilePath: string
|
tempFilePath: string,
|
||||||
): Promise<PDFProcessorResult> {
|
): Promise<PDFProcessorResult> {
|
||||||
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
|
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
|
||||||
|
|
||||||
@@ -118,7 +118,7 @@ async function scrapePDFWithParsePDF(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
markdown: escaped,
|
markdown: escaped,
|
||||||
html: escaped
|
html: escaped,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -131,7 +131,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
statusCode: file.response.status,
|
statusCode: file.response.status,
|
||||||
|
|
||||||
html: content,
|
html: content,
|
||||||
markdown: content
|
markdown: content,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -144,22 +144,22 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
{
|
{
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
method: "scrapePDF/scrapePDFWithLlamaParse"
|
method: "scrapePDF/scrapePDFWithLlamaParse",
|
||||||
})
|
}),
|
||||||
},
|
},
|
||||||
tempFilePath
|
tempFilePath,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
||||||
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
|
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
} else if (error instanceof RemoveFeatureError) {
|
} else if (error instanceof RemoveFeatureError) {
|
||||||
throw error;
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
|
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
|
||||||
{ error }
|
{ error },
|
||||||
);
|
);
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
}
|
}
|
||||||
@@ -170,9 +170,11 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
result = await scrapePDFWithParsePDF(
|
result = await scrapePDFWithParsePDF(
|
||||||
{
|
{
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" })
|
logger: meta.logger.child({
|
||||||
|
method: "scrapePDF/scrapePDFWithParsePDF",
|
||||||
|
}),
|
||||||
},
|
},
|
||||||
tempFilePath
|
tempFilePath,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -183,6 +185,6 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
|
||||||
html: result.html,
|
html: result.html,
|
||||||
markdown: result.markdown
|
markdown: result.markdown,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import { TimeoutError } from "../../error";
|
|||||||
import { robustFetch } from "../../lib/fetch";
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
|
||||||
export async function scrapeURLWithPlaywright(
|
export async function scrapeURLWithPlaywright(
|
||||||
meta: Meta
|
meta: Meta,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const timeout = 20000 + meta.options.waitFor;
|
const timeout = 20000 + meta.options.waitFor;
|
||||||
|
|
||||||
@@ -13,35 +13,35 @@ export async function scrapeURLWithPlaywright(
|
|||||||
await robustFetch({
|
await robustFetch({
|
||||||
url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
|
url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: {
|
body: {
|
||||||
url: meta.url,
|
url: meta.url,
|
||||||
wait_after_load: meta.options.waitFor,
|
wait_after_load: meta.options.waitFor,
|
||||||
timeout,
|
timeout,
|
||||||
headers: meta.options.headers
|
headers: meta.options.headers,
|
||||||
},
|
},
|
||||||
method: "POST",
|
method: "POST",
|
||||||
logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
|
logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
content: z.string(),
|
content: z.string(),
|
||||||
pageStatusCode: z.number(),
|
pageStatusCode: z.number(),
|
||||||
pageError: z.string().optional()
|
pageError: z.string().optional(),
|
||||||
})
|
}),
|
||||||
}),
|
}),
|
||||||
(async () => {
|
(async () => {
|
||||||
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
|
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
|
||||||
throw new TimeoutError(
|
throw new TimeoutError(
|
||||||
"Playwright was unable to scrape the page before timing out",
|
"Playwright was unable to scrape the page before timing out",
|
||||||
{ cause: { timeout } }
|
{ cause: { timeout } },
|
||||||
);
|
);
|
||||||
})()
|
})(),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: meta.url, // TODO: impove redirect following
|
url: meta.url, // TODO: impove redirect following
|
||||||
html: response.content,
|
html: response.content,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
error: response.pageError
|
error: response.pageError,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import { EngineError } from "../../error";
|
|||||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
||||||
|
|
||||||
export function scrapeURLWithScrapingBee(
|
export function scrapeURLWithScrapingBee(
|
||||||
wait_browser: "domcontentloaded" | "networkidle2"
|
wait_browser: "domcontentloaded" | "networkidle2",
|
||||||
): (meta: Meta) => Promise<EngineScrapeResult> {
|
): (meta: Meta) => Promise<EngineScrapeResult> {
|
||||||
return async (meta: Meta): Promise<EngineScrapeResult> => {
|
return async (meta: Meta): Promise<EngineScrapeResult> => {
|
||||||
let response: AxiosResponse<any>;
|
let response: AxiosResponse<any>;
|
||||||
@@ -23,12 +23,12 @@ export function scrapeURLWithScrapingBee(
|
|||||||
json_response: true,
|
json_response: true,
|
||||||
screenshot: meta.options.formats.includes("screenshot"),
|
screenshot: meta.options.formats.includes("screenshot"),
|
||||||
screenshot_full_page: meta.options.formats.includes(
|
screenshot_full_page: meta.options.formats.includes(
|
||||||
"screenshot@fullPage"
|
"screenshot@fullPage",
|
||||||
)
|
),
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
"ScrapingService-Request": "TRUE" // this is sent to the page, not to ScrapingBee - mogery
|
"ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof AxiosError && error.response !== undefined) {
|
if (error instanceof AxiosError && error.response !== undefined) {
|
||||||
@@ -51,25 +51,25 @@ export function scrapeURLWithScrapingBee(
|
|||||||
|
|
||||||
if (body.errors || body.body?.error || isHiddenEngineError) {
|
if (body.errors || body.body?.error || isHiddenEngineError) {
|
||||||
meta.logger.error("ScrapingBee threw an error", {
|
meta.logger.error("ScrapingBee threw an error", {
|
||||||
body: body.body?.error ?? body.errors ?? body.body ?? body
|
body: body.body?.error ?? body.errors ?? body.body ?? body,
|
||||||
});
|
});
|
||||||
throw new EngineError("Engine error #34", {
|
throw new EngineError("Engine error #34", {
|
||||||
cause: { body, statusCode: response.status }
|
cause: { body, statusCode: response.status },
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (typeof body.body !== "string") {
|
if (typeof body.body !== "string") {
|
||||||
meta.logger.error("ScrapingBee: Body is not string??", { body });
|
meta.logger.error("ScrapingBee: Body is not string??", { body });
|
||||||
throw new EngineError("Engine error #35", {
|
throw new EngineError("Engine error #35", {
|
||||||
cause: { body, statusCode: response.status }
|
cause: { body, statusCode: response.status },
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
specialtyScrapeCheck(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck"
|
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
|
||||||
}),
|
}),
|
||||||
body.headers
|
body.headers,
|
||||||
);
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -80,9 +80,9 @@ export function scrapeURLWithScrapingBee(
|
|||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
...(body.screenshot
|
...(body.screenshot
|
||||||
? {
|
? {
|
||||||
screenshot: `data:image/png;base64,${body.screenshot}`
|
screenshot: `data:image/png;base64,${body.screenshot}`,
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,13 +13,13 @@ export async function fetchFileToBuffer(url: string): Promise<{
|
|||||||
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
||||||
return {
|
return {
|
||||||
response,
|
response,
|
||||||
buffer: Buffer.from(await response.arrayBuffer())
|
buffer: Buffer.from(await response.arrayBuffer()),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function downloadFile(
|
export async function downloadFile(
|
||||||
id: string,
|
id: string,
|
||||||
url: string
|
url: string,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
response: undici.Response;
|
response: undici.Response;
|
||||||
tempFilePath: string;
|
tempFilePath: string;
|
||||||
@@ -32,9 +32,9 @@ export async function downloadFile(
|
|||||||
const response = await undici.fetch(url, {
|
const response = await undici.fetch(url, {
|
||||||
dispatcher: new undici.Agent({
|
dispatcher: new undici.Agent({
|
||||||
connect: {
|
connect: {
|
||||||
rejectUnauthorized: false
|
rejectUnauthorized: false,
|
||||||
}
|
},
|
||||||
})
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
// This should never happen in the current state of JS (2024), but let's check anyways.
|
// This should never happen in the current state of JS (2024), but let's check anyways.
|
||||||
@@ -47,13 +47,13 @@ export async function downloadFile(
|
|||||||
tempFileWrite.on("finish", () => resolve(null));
|
tempFileWrite.on("finish", () => resolve(null));
|
||||||
tempFileWrite.on("error", (error) => {
|
tempFileWrite.on("error", (error) => {
|
||||||
reject(
|
reject(
|
||||||
new EngineError("Failed to write to temp file", { cause: { error } })
|
new EngineError("Failed to write to temp file", { cause: { error } }),
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
response,
|
response,
|
||||||
tempFilePath
|
tempFilePath,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,15 +3,15 @@ import { AddFeatureError } from "../../error";
|
|||||||
|
|
||||||
export function specialtyScrapeCheck(
|
export function specialtyScrapeCheck(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
headers: Record<string, string> | undefined
|
headers: Record<string, string> | undefined,
|
||||||
) {
|
) {
|
||||||
const contentType = (Object.entries(headers ?? {}).find(
|
const contentType = (Object.entries(headers ?? {}).find(
|
||||||
(x) => x[0].toLowerCase() === "content-type"
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
) ?? [])[1];
|
) ?? [])[1];
|
||||||
|
|
||||||
if (contentType === undefined) {
|
if (contentType === undefined) {
|
||||||
logger.warn("Failed to check contentType -- was not present in headers", {
|
logger.warn("Failed to check contentType -- was not present in headers", {
|
||||||
headers
|
headers,
|
||||||
});
|
});
|
||||||
} else if (
|
} else if (
|
||||||
contentType === "application/pdf" ||
|
contentType === "application/pdf" ||
|
||||||
@@ -23,7 +23,7 @@ export function specialtyScrapeCheck(
|
|||||||
contentType ===
|
contentType ===
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
||||||
contentType.startsWith(
|
contentType.startsWith(
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document;"
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document;",
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
// .docx
|
// .docx
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ export class NoEnginesLeftError extends Error {
|
|||||||
|
|
||||||
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
|
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
|
||||||
super(
|
super(
|
||||||
"All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com."
|
"All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.",
|
||||||
);
|
);
|
||||||
this.fallbackList = fallbackList;
|
this.fallbackList = fallbackList;
|
||||||
this.results = results;
|
this.results = results;
|
||||||
@@ -40,7 +40,8 @@ export class RemoveFeatureError extends Error {
|
|||||||
|
|
||||||
constructor(featureFlags: FeatureFlag[]) {
|
constructor(featureFlags: FeatureFlag[]) {
|
||||||
super(
|
super(
|
||||||
"Incorrect feature flags have been discovered: " + featureFlags.join(", ")
|
"Incorrect feature flags have been discovered: " +
|
||||||
|
featureFlags.join(", "),
|
||||||
);
|
);
|
||||||
this.featureFlags = featureFlags;
|
this.featureFlags = featureFlags;
|
||||||
}
|
}
|
||||||
@@ -50,7 +51,7 @@ export class SiteError extends Error {
|
|||||||
public code: string;
|
public code: string;
|
||||||
constructor(code: string) {
|
constructor(code: string) {
|
||||||
super(
|
super(
|
||||||
"Specified URL is failing to load in the browser. Error code: " + code
|
"Specified URL is failing to load in the browser. Error code: " + code,
|
||||||
);
|
);
|
||||||
this.code = code;
|
this.code = code;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import {
|
|||||||
Engine,
|
Engine,
|
||||||
EngineScrapeResult,
|
EngineScrapeResult,
|
||||||
FeatureFlag,
|
FeatureFlag,
|
||||||
scrapeURLWithEngine
|
scrapeURLWithEngine,
|
||||||
} from "./engines";
|
} from "./engines";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import {
|
import {
|
||||||
@@ -17,7 +17,7 @@ import {
|
|||||||
NoEnginesLeftError,
|
NoEnginesLeftError,
|
||||||
RemoveFeatureError,
|
RemoveFeatureError,
|
||||||
SiteError,
|
SiteError,
|
||||||
TimeoutError
|
TimeoutError,
|
||||||
} from "./error";
|
} from "./error";
|
||||||
import { executeTransformers } from "./transformers";
|
import { executeTransformers } from "./transformers";
|
||||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||||
@@ -50,7 +50,7 @@ export type Meta = {
|
|||||||
function buildFeatureFlags(
|
function buildFeatureFlags(
|
||||||
url: string,
|
url: string,
|
||||||
options: ScrapeOptions,
|
options: ScrapeOptions,
|
||||||
internalOptions: InternalOptions
|
internalOptions: InternalOptions,
|
||||||
): Set<FeatureFlag> {
|
): Set<FeatureFlag> {
|
||||||
const flags: Set<FeatureFlag> = new Set();
|
const flags: Set<FeatureFlag> = new Set();
|
||||||
|
|
||||||
@@ -112,7 +112,7 @@ function buildMetaObject(
|
|||||||
id: string,
|
id: string,
|
||||||
url: string,
|
url: string,
|
||||||
options: ScrapeOptions,
|
options: ScrapeOptions,
|
||||||
internalOptions: InternalOptions
|
internalOptions: InternalOptions,
|
||||||
): Meta {
|
): Meta {
|
||||||
const specParams =
|
const specParams =
|
||||||
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
||||||
@@ -120,14 +120,14 @@ function buildMetaObject(
|
|||||||
options = Object.assign(options, specParams.scrapeOptions);
|
options = Object.assign(options, specParams.scrapeOptions);
|
||||||
internalOptions = Object.assign(
|
internalOptions = Object.assign(
|
||||||
internalOptions,
|
internalOptions,
|
||||||
specParams.internalOptions
|
specParams.internalOptions,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const _logger = logger.child({
|
const _logger = logger.child({
|
||||||
module: "ScrapeURL",
|
module: "ScrapeURL",
|
||||||
scrapeId: id,
|
scrapeId: id,
|
||||||
scrapeURL: url
|
scrapeURL: url,
|
||||||
});
|
});
|
||||||
const logs: any[] = [];
|
const logs: any[] = [];
|
||||||
|
|
||||||
@@ -138,7 +138,7 @@ function buildMetaObject(
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
logger: _logger,
|
logger: _logger,
|
||||||
logs,
|
logs,
|
||||||
featureFlags: buildFeatureFlags(url, options, internalOptions)
|
featureFlags: buildFeatureFlags(url, options, internalOptions),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -229,7 +229,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||||
unsupportedFeatures,
|
unsupportedFeatures,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now()
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// NOTE: TODO: what to do when status code is bad is tough...
|
// NOTE: TODO: what to do when status code is bad is tough...
|
||||||
@@ -237,35 +237,35 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
||||||
if (isLongEnough || !isGoodStatusCode) {
|
if (isLongEnough || !isGoodStatusCode) {
|
||||||
meta.logger.info("Scrape via " + engine + " deemed successful.", {
|
meta.logger.info("Scrape via " + engine + " deemed successful.", {
|
||||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError }
|
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||||
});
|
});
|
||||||
result = {
|
result = {
|
||||||
engine,
|
engine,
|
||||||
unsupportedFeatures,
|
unsupportedFeatures,
|
||||||
result: engineResult as EngineScrapeResult & { markdown: string }
|
result: engineResult as EngineScrapeResult & { markdown: string },
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof EngineError) {
|
if (error instanceof EngineError) {
|
||||||
meta.logger.info("Engine " + engine + " could not scrape the page.", {
|
meta.logger.info("Engine " + engine + " could not scrape the page.", {
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "error",
|
state: "error",
|
||||||
error: safeguardCircularError(error),
|
error: safeguardCircularError(error),
|
||||||
unexpected: false,
|
unexpected: false,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now()
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
} else if (error instanceof TimeoutError) {
|
} else if (error instanceof TimeoutError) {
|
||||||
meta.logger.info("Engine " + engine + " timed out while scraping.", {
|
meta.logger.info("Engine " + engine + " timed out while scraping.", {
|
||||||
error
|
error,
|
||||||
});
|
});
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "timeout",
|
state: "timeout",
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now()
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
} else if (
|
} else if (
|
||||||
error instanceof AddFeatureError ||
|
error instanceof AddFeatureError ||
|
||||||
@@ -278,7 +278,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
error: safeguardCircularError(error),
|
error: safeguardCircularError(error),
|
||||||
unexpected: true,
|
unexpected: true,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now()
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
error.results = results;
|
error.results = results;
|
||||||
meta.logger.warn("LLM refusal encountered", { error });
|
meta.logger.warn("LLM refusal encountered", { error });
|
||||||
@@ -289,14 +289,14 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
meta.logger.info(
|
meta.logger.info(
|
||||||
"An unexpected error happened while scraping with " + engine + ".",
|
"An unexpected error happened while scraping with " + engine + ".",
|
||||||
{ error }
|
{ error },
|
||||||
);
|
);
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "error",
|
state: "error",
|
||||||
error: safeguardCircularError(error),
|
error: safeguardCircularError(error),
|
||||||
unexpected: true,
|
unexpected: true,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now()
|
finishedAt: Date.now(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -305,7 +305,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
if (result === null) {
|
if (result === null) {
|
||||||
throw new NoEnginesLeftError(
|
throw new NoEnginesLeftError(
|
||||||
fallbackList.map((x) => x.engine),
|
fallbackList.map((x) => x.engine),
|
||||||
results
|
results,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -318,15 +318,15 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
sourceURL: meta.url,
|
sourceURL: meta.url,
|
||||||
url: result.result.url,
|
url: result.result.url,
|
||||||
statusCode: result.result.statusCode,
|
statusCode: result.result.statusCode,
|
||||||
error: result.result.error
|
error: result.result.error,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (result.unsupportedFeatures.size > 0) {
|
if (result.unsupportedFeatures.size > 0) {
|
||||||
const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
|
const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
|
||||||
meta.logger.warn(warning, {
|
meta.logger.warn(warning, {
|
||||||
engine: result.engine,
|
engine: result.engine,
|
||||||
unsupportedFeatures: result.unsupportedFeatures
|
unsupportedFeatures: result.unsupportedFeatures,
|
||||||
});
|
});
|
||||||
document.warning =
|
document.warning =
|
||||||
document.warning !== undefined
|
document.warning !== undefined
|
||||||
@@ -340,7 +340,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
success: true,
|
success: true,
|
||||||
document,
|
document,
|
||||||
logs: meta.logs,
|
logs: meta.logs,
|
||||||
engines: results
|
engines: results,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -348,7 +348,7 @@ export async function scrapeURL(
|
|||||||
id: string,
|
id: string,
|
||||||
url: string,
|
url: string,
|
||||||
options: ScrapeOptions,
|
options: ScrapeOptions,
|
||||||
internalOptions: InternalOptions = {}
|
internalOptions: InternalOptions = {},
|
||||||
): Promise<ScrapeUrlResponse> {
|
): Promise<ScrapeUrlResponse> {
|
||||||
const meta = buildMetaObject(id, url, options, internalOptions);
|
const meta = buildMetaObject(id, url, options, internalOptions);
|
||||||
try {
|
try {
|
||||||
@@ -363,10 +363,10 @@ export async function scrapeURL(
|
|||||||
meta.logger.debug(
|
meta.logger.debug(
|
||||||
"More feature flags requested by scraper: adding " +
|
"More feature flags requested by scraper: adding " +
|
||||||
error.featureFlags.join(", "),
|
error.featureFlags.join(", "),
|
||||||
{ error, existingFlags: meta.featureFlags }
|
{ error, existingFlags: meta.featureFlags },
|
||||||
);
|
);
|
||||||
meta.featureFlags = new Set(
|
meta.featureFlags = new Set(
|
||||||
[...meta.featureFlags].concat(error.featureFlags)
|
[...meta.featureFlags].concat(error.featureFlags),
|
||||||
);
|
);
|
||||||
} else if (
|
} else if (
|
||||||
error instanceof RemoveFeatureError &&
|
error instanceof RemoveFeatureError &&
|
||||||
@@ -375,12 +375,12 @@ export async function scrapeURL(
|
|||||||
meta.logger.debug(
|
meta.logger.debug(
|
||||||
"Incorrect feature flags reported by scraper: removing " +
|
"Incorrect feature flags reported by scraper: removing " +
|
||||||
error.featureFlags.join(","),
|
error.featureFlags.join(","),
|
||||||
{ error, existingFlags: meta.featureFlags }
|
{ error, existingFlags: meta.featureFlags },
|
||||||
);
|
);
|
||||||
meta.featureFlags = new Set(
|
meta.featureFlags = new Set(
|
||||||
[...meta.featureFlags].filter(
|
[...meta.featureFlags].filter(
|
||||||
(x) => !error.featureFlags.includes(x)
|
(x) => !error.featureFlags.includes(x),
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
throw error;
|
throw error;
|
||||||
@@ -415,7 +415,7 @@ export async function scrapeURL(
|
|||||||
success: false,
|
success: false,
|
||||||
error,
|
error,
|
||||||
logs: meta.logs,
|
logs: meta.logs,
|
||||||
engines: results
|
engines: results,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(
|
logger.error(
|
||||||
`Failed to construct URL for href: ${href} with base: ${baseUrl}`,
|
`Failed to construct URL for href: ${href} with base: ${baseUrl}`,
|
||||||
{ error }
|
{ error },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import { Meta } from "..";
|
|||||||
|
|
||||||
export function extractMetadata(
|
export function extractMetadata(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
html: string
|
html: string,
|
||||||
): Document["metadata"] {
|
): Document["metadata"] {
|
||||||
let title: string | undefined = undefined;
|
let title: string | undefined = undefined;
|
||||||
let description: string | undefined = undefined;
|
let description: string | undefined = undefined;
|
||||||
@@ -148,6 +148,6 @@ export function extractMetadata(
|
|||||||
publishedTime,
|
publishedTime,
|
||||||
articleTag,
|
articleTag,
|
||||||
articleSection,
|
articleSection,
|
||||||
...customMetadata
|
...customMetadata,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
|||||||
|
|
||||||
export async function robustFetch<
|
export async function robustFetch<
|
||||||
Schema extends z.Schema<any>,
|
Schema extends z.Schema<any>,
|
||||||
Output = z.infer<Schema>
|
Output = z.infer<Schema>,
|
||||||
>({
|
>({
|
||||||
url,
|
url,
|
||||||
logger,
|
logger,
|
||||||
@@ -32,7 +32,7 @@ export async function robustFetch<
|
|||||||
ignoreFailure = false,
|
ignoreFailure = false,
|
||||||
requestId = uuid(),
|
requestId = uuid(),
|
||||||
tryCount = 1,
|
tryCount = 1,
|
||||||
tryCooldown
|
tryCooldown,
|
||||||
}: RobustFetchParams<Schema>): Promise<Output> {
|
}: RobustFetchParams<Schema>): Promise<Output> {
|
||||||
const params = {
|
const params = {
|
||||||
url,
|
url,
|
||||||
@@ -44,7 +44,7 @@ export async function robustFetch<
|
|||||||
ignoreResponse,
|
ignoreResponse,
|
||||||
ignoreFailure,
|
ignoreFailure,
|
||||||
tryCount,
|
tryCount,
|
||||||
tryCooldown
|
tryCooldown,
|
||||||
};
|
};
|
||||||
|
|
||||||
let request: Response;
|
let request: Response;
|
||||||
@@ -56,20 +56,20 @@ export async function robustFetch<
|
|||||||
? {}
|
? {}
|
||||||
: body !== undefined
|
: body !== undefined
|
||||||
? {
|
? {
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json",
|
||||||
}
|
}
|
||||||
: {}),
|
: {}),
|
||||||
...(headers !== undefined ? headers : {})
|
...(headers !== undefined ? headers : {}),
|
||||||
},
|
},
|
||||||
...(body instanceof FormData
|
...(body instanceof FormData
|
||||||
? {
|
? {
|
||||||
body
|
body,
|
||||||
}
|
}
|
||||||
: body !== undefined
|
: body !== undefined
|
||||||
? {
|
? {
|
||||||
body: JSON.stringify(body)
|
body: JSON.stringify(body),
|
||||||
}
|
}
|
||||||
: {})
|
: {}),
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (!ignoreFailure) {
|
if (!ignoreFailure) {
|
||||||
@@ -77,12 +77,12 @@ export async function robustFetch<
|
|||||||
if (tryCount > 1) {
|
if (tryCount > 1) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Request failed, trying " + (tryCount - 1) + " more times",
|
"Request failed, trying " + (tryCount - 1) + " more times",
|
||||||
{ params, error, requestId }
|
{ params, error, requestId },
|
||||||
);
|
);
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
...params,
|
...params,
|
||||||
requestId,
|
requestId,
|
||||||
tryCount: tryCount - 1
|
tryCount: tryCount - 1,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Request failed", { params, error, requestId });
|
logger.debug("Request failed", { params, error, requestId });
|
||||||
@@ -90,8 +90,8 @@ export async function robustFetch<
|
|||||||
cause: {
|
cause: {
|
||||||
params,
|
params,
|
||||||
requestId,
|
requestId,
|
||||||
error
|
error,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -106,39 +106,39 @@ export async function robustFetch<
|
|||||||
const response = {
|
const response = {
|
||||||
status: request.status,
|
status: request.status,
|
||||||
headers: request.headers,
|
headers: request.headers,
|
||||||
body: await request.text() // NOTE: can this throw an exception?
|
body: await request.text(), // NOTE: can this throw an exception?
|
||||||
};
|
};
|
||||||
|
|
||||||
if (request.status >= 300) {
|
if (request.status >= 300) {
|
||||||
if (tryCount > 1) {
|
if (tryCount > 1) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Request sent failure status, trying " + (tryCount - 1) + " more times",
|
"Request sent failure status, trying " + (tryCount - 1) + " more times",
|
||||||
{ params, request, response, requestId }
|
{ params, request, response, requestId },
|
||||||
);
|
);
|
||||||
if (tryCooldown !== undefined) {
|
if (tryCooldown !== undefined) {
|
||||||
await new Promise((resolve) =>
|
await new Promise((resolve) =>
|
||||||
setTimeout(() => resolve(null), tryCooldown)
|
setTimeout(() => resolve(null), tryCooldown),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
...params,
|
...params,
|
||||||
requestId,
|
requestId,
|
||||||
tryCount: tryCount - 1
|
tryCount: tryCount - 1,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Request sent failure status", {
|
logger.debug("Request sent failure status", {
|
||||||
params,
|
params,
|
||||||
request,
|
request,
|
||||||
response,
|
response,
|
||||||
requestId
|
requestId,
|
||||||
});
|
});
|
||||||
throw new Error("Request sent failure status", {
|
throw new Error("Request sent failure status", {
|
||||||
cause: {
|
cause: {
|
||||||
params,
|
params,
|
||||||
request,
|
request,
|
||||||
response,
|
response,
|
||||||
requestId
|
requestId,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -151,15 +151,15 @@ export async function robustFetch<
|
|||||||
params,
|
params,
|
||||||
request,
|
request,
|
||||||
response,
|
response,
|
||||||
requestId
|
requestId,
|
||||||
});
|
});
|
||||||
throw new Error("Request sent malformed JSON", {
|
throw new Error("Request sent malformed JSON", {
|
||||||
cause: {
|
cause: {
|
||||||
params,
|
params,
|
||||||
request,
|
request,
|
||||||
response,
|
response,
|
||||||
requestId
|
requestId,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +174,7 @@ export async function robustFetch<
|
|||||||
response,
|
response,
|
||||||
requestId,
|
requestId,
|
||||||
error,
|
error,
|
||||||
schema
|
schema,
|
||||||
});
|
});
|
||||||
throw new Error("Response does not match provided schema", {
|
throw new Error("Response does not match provided schema", {
|
||||||
cause: {
|
cause: {
|
||||||
@@ -183,8 +183,8 @@ export async function robustFetch<
|
|||||||
response,
|
response,
|
||||||
requestId,
|
requestId,
|
||||||
error,
|
error,
|
||||||
schema
|
schema,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Parsing response with provided schema failed", {
|
logger.debug("Parsing response with provided schema failed", {
|
||||||
@@ -193,7 +193,7 @@ export async function robustFetch<
|
|||||||
response,
|
response,
|
||||||
requestId,
|
requestId,
|
||||||
error,
|
error,
|
||||||
schema
|
schema,
|
||||||
});
|
});
|
||||||
throw new Error("Parsing response with provided schema failed", {
|
throw new Error("Parsing response with provided schema failed", {
|
||||||
cause: {
|
cause: {
|
||||||
@@ -202,8 +202,8 @@ export async function robustFetch<
|
|||||||
response,
|
response,
|
||||||
requestId,
|
requestId,
|
||||||
error,
|
error,
|
||||||
schema
|
schema,
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -47,14 +47,14 @@ const excludeNonMainTags = [
|
|||||||
".widget",
|
".widget",
|
||||||
"#widget",
|
"#widget",
|
||||||
".cookie",
|
".cookie",
|
||||||
"#cookie"
|
"#cookie",
|
||||||
];
|
];
|
||||||
|
|
||||||
const forceIncludeMainTags = ["#main"];
|
const forceIncludeMainTags = ["#main"];
|
||||||
|
|
||||||
export const removeUnwantedElements = (
|
export const removeUnwantedElements = (
|
||||||
html: string,
|
html: string,
|
||||||
scrapeOptions: ScrapeOptions
|
scrapeOptions: ScrapeOptions,
|
||||||
) => {
|
) => {
|
||||||
const soup = load(html);
|
const soup = load(html);
|
||||||
|
|
||||||
@@ -89,11 +89,11 @@ export const removeUnwantedElements = (
|
|||||||
const attributes = element.attribs;
|
const attributes = element.attribs;
|
||||||
const tagNameMatches = regexPattern.test(element.name);
|
const tagNameMatches = regexPattern.test(element.name);
|
||||||
const attributesMatch = Object.keys(attributes).some((attr) =>
|
const attributesMatch = Object.keys(attributes).some((attr) =>
|
||||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
regexPattern.test(`${attr}="${attributes[attr]}"`),
|
||||||
);
|
);
|
||||||
if (tag.startsWith("*.")) {
|
if (tag.startsWith("*.")) {
|
||||||
classMatch = Object.keys(attributes).some((attr) =>
|
classMatch = Object.keys(attributes).some((attr) =>
|
||||||
regexPattern.test(`class="${attributes[attr]}"`)
|
regexPattern.test(`class="${attributes[attr]}"`),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return tagNameMatches || attributesMatch || classMatch;
|
return tagNameMatches || attributesMatch || classMatch;
|
||||||
@@ -110,7 +110,7 @@ export const removeUnwantedElements = (
|
|||||||
if (scrapeOptions.onlyMainContent) {
|
if (scrapeOptions.onlyMainContent) {
|
||||||
excludeNonMainTags.forEach((tag) => {
|
excludeNonMainTags.forEach((tag) => {
|
||||||
const elementsToRemove = soup(tag).filter(
|
const elementsToRemove = soup(tag).filter(
|
||||||
forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join("")
|
forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join(""),
|
||||||
);
|
);
|
||||||
|
|
||||||
elementsToRemove.remove();
|
elementsToRemove.remove();
|
||||||
|
|||||||
@@ -42,10 +42,10 @@ export const urlSpecificParams: Record<string, UrlSpecificParams> = {
|
|||||||
// },
|
// },
|
||||||
"digikey.com": {
|
"digikey.com": {
|
||||||
scrapeOptions: {},
|
scrapeOptions: {},
|
||||||
internalOptions: { forceEngine: "fire-engine;tlsclient" }
|
internalOptions: { forceEngine: "fire-engine;tlsclient" },
|
||||||
},
|
},
|
||||||
"lorealparis.hu": {
|
"lorealparis.hu": {
|
||||||
scrapeOptions: {},
|
scrapeOptions: {},
|
||||||
internalOptions: { forceEngine: "fire-engine;tlsclient" }
|
internalOptions: { forceEngine: "fire-engine;tlsclient" },
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ const testEngines: (Engine | undefined)[] = [
|
|||||||
"fire-engine;tlsclient",
|
"fire-engine;tlsclient",
|
||||||
"scrapingbee",
|
"scrapingbee",
|
||||||
"scrapingbeeLoad",
|
"scrapingbeeLoad",
|
||||||
"fetch"
|
"fetch",
|
||||||
];
|
];
|
||||||
|
|
||||||
const testEnginesScreenshot: (Engine | undefined)[] = [
|
const testEnginesScreenshot: (Engine | undefined)[] = [
|
||||||
@@ -21,7 +21,7 @@ const testEnginesScreenshot: (Engine | undefined)[] = [
|
|||||||
"fire-engine;chrome-cdp",
|
"fire-engine;chrome-cdp",
|
||||||
"fire-engine;playwright",
|
"fire-engine;playwright",
|
||||||
"scrapingbee",
|
"scrapingbee",
|
||||||
"scrapingbeeLoad"
|
"scrapingbeeLoad",
|
||||||
];
|
];
|
||||||
|
|
||||||
describe("Standalone scrapeURL tests", () => {
|
describe("Standalone scrapeURL tests", () => {
|
||||||
@@ -31,7 +31,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-basic",
|
"test:scrape-basic",
|
||||||
"https://www.roastmywebsite.ai/",
|
"https://www.roastmywebsite.ai/",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -46,26 +46,26 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
expect(out.document.metadata.title).toBe("Roast My Website");
|
expect(out.document.metadata.title).toBe("Roast My Website");
|
||||||
expect(out.document.metadata.description).toBe(
|
expect(out.document.metadata.description).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.keywords).toBe(
|
expect(out.document.metadata.keywords).toBe(
|
||||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
"Roast My Website,Roast,Website,GitHub,Firecrawl",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.robots).toBe("follow, index");
|
expect(out.document.metadata.robots).toBe("follow, index");
|
||||||
expect(out.document.metadata.ogTitle).toBe("Roast My Website");
|
expect(out.document.metadata.ogTitle).toBe("Roast My Website");
|
||||||
expect(out.document.metadata.ogDescription).toBe(
|
expect(out.document.metadata.ogDescription).toBe(
|
||||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.ogUrl).toBe(
|
expect(out.document.metadata.ogUrl).toBe(
|
||||||
"https://www.roastmywebsite.ai"
|
"https://www.roastmywebsite.ai",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.ogImage).toBe(
|
expect(out.document.metadata.ogImage).toBe(
|
||||||
"https://www.roastmywebsite.ai/og.png"
|
"https://www.roastmywebsite.ai/og.png",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.ogLocaleAlternate).toStrictEqual([]);
|
expect(out.document.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||||
expect(out.document.metadata.ogSiteName).toBe("Roast My Website");
|
expect(out.document.metadata.ogSiteName).toBe("Roast My Website");
|
||||||
expect(out.document.metadata.sourceURL).toBe(
|
expect(out.document.metadata.sourceURL).toBe(
|
||||||
"https://www.roastmywebsite.ai/"
|
"https://www.roastmywebsite.ai/",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
}
|
}
|
||||||
@@ -76,9 +76,9 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-formats-markdown-html",
|
"test:scrape-formats-markdown-html",
|
||||||
"https://roastmywebsite.ai",
|
"https://roastmywebsite.ai",
|
||||||
scrapeOptions.parse({
|
scrapeOptions.parse({
|
||||||
formats: ["markdown", "html"]
|
formats: ["markdown", "html"],
|
||||||
}),
|
}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -100,9 +100,9 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-onlyMainContent-false",
|
"test:scrape-onlyMainContent-false",
|
||||||
"https://www.scrapethissite.com/",
|
"https://www.scrapethissite.com/",
|
||||||
scrapeOptions.parse({
|
scrapeOptions.parse({
|
||||||
onlyMainContent: false
|
onlyMainContent: false,
|
||||||
}),
|
}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -123,9 +123,9 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"https://www.scrapethissite.com/",
|
"https://www.scrapethissite.com/",
|
||||||
scrapeOptions.parse({
|
scrapeOptions.parse({
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
excludeTags: [".nav", "#footer", "strong"]
|
excludeTags: [".nav", "#footer", "strong"],
|
||||||
}),
|
}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -145,7 +145,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-400",
|
"test:scrape-400",
|
||||||
"https://httpstat.us/400",
|
"https://httpstat.us/400",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -163,7 +163,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-401",
|
"test:scrape-401",
|
||||||
"https://httpstat.us/401",
|
"https://httpstat.us/401",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -181,7 +181,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-403",
|
"test:scrape-403",
|
||||||
"https://httpstat.us/403",
|
"https://httpstat.us/403",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -199,7 +199,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-404",
|
"test:scrape-404",
|
||||||
"https://httpstat.us/404",
|
"https://httpstat.us/404",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -217,7 +217,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-405",
|
"test:scrape-405",
|
||||||
"https://httpstat.us/405",
|
"https://httpstat.us/405",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -235,7 +235,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-500",
|
"test:scrape-500",
|
||||||
"https://httpstat.us/500",
|
"https://httpstat.us/500",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -253,7 +253,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-redirect",
|
"test:scrape-redirect",
|
||||||
"https://scrapethissite.com/",
|
"https://scrapethissite.com/",
|
||||||
scrapeOptions.parse({}),
|
scrapeOptions.parse({}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -264,10 +264,10 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.markdown).toContain("Explore Sandbox");
|
expect(out.document.markdown).toContain("Explore Sandbox");
|
||||||
expect(out.document).toHaveProperty("metadata");
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.sourceURL).toBe(
|
expect(out.document.metadata.sourceURL).toBe(
|
||||||
"https://scrapethissite.com/"
|
"https://scrapethissite.com/",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.url).toBe(
|
expect(out.document.metadata.url).toBe(
|
||||||
"https://www.scrapethissite.com/"
|
"https://www.scrapethissite.com/",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
@@ -283,9 +283,9 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-screenshot",
|
"test:scrape-screenshot",
|
||||||
"https://www.scrapethissite.com/",
|
"https://www.scrapethissite.com/",
|
||||||
scrapeOptions.parse({
|
scrapeOptions.parse({
|
||||||
formats: ["screenshot"]
|
formats: ["screenshot"],
|
||||||
}),
|
}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -296,8 +296,8 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(typeof out.document.screenshot).toBe("string");
|
expect(typeof out.document.screenshot).toBe("string");
|
||||||
expect(
|
expect(
|
||||||
out.document.screenshot!.startsWith(
|
out.document.screenshot!.startsWith(
|
||||||
"https://service.firecrawl.dev/storage/v1/object/public/media/"
|
"https://service.firecrawl.dev/storage/v1/object/public/media/",
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
// TODO: attempt to fetch screenshot
|
// TODO: attempt to fetch screenshot
|
||||||
expect(out.document).toHaveProperty("metadata");
|
expect(out.document).toHaveProperty("metadata");
|
||||||
@@ -311,9 +311,9 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
"test:scrape-screenshot-fullPage",
|
"test:scrape-screenshot-fullPage",
|
||||||
"https://www.scrapethissite.com/",
|
"https://www.scrapethissite.com/",
|
||||||
scrapeOptions.parse({
|
scrapeOptions.parse({
|
||||||
formats: ["screenshot@fullPage"]
|
formats: ["screenshot@fullPage"],
|
||||||
}),
|
}),
|
||||||
{ forceEngine }
|
{ forceEngine },
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -324,8 +324,8 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(typeof out.document.screenshot).toBe("string");
|
expect(typeof out.document.screenshot).toBe("string");
|
||||||
expect(
|
expect(
|
||||||
out.document.screenshot!.startsWith(
|
out.document.screenshot!.startsWith(
|
||||||
"https://service.firecrawl.dev/storage/v1/object/public/media/"
|
"https://service.firecrawl.dev/storage/v1/object/public/media/",
|
||||||
)
|
),
|
||||||
);
|
);
|
||||||
// TODO: attempt to fetch screenshot
|
// TODO: attempt to fetch screenshot
|
||||||
expect(out.document).toHaveProperty("metadata");
|
expect(out.document).toHaveProperty("metadata");
|
||||||
@@ -333,14 +333,14 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
it("Scrape of a PDF file", async () => {
|
it("Scrape of a PDF file", async () => {
|
||||||
const out = await scrapeURL(
|
const out = await scrapeURL(
|
||||||
"test:scrape-pdf",
|
"test:scrape-pdf",
|
||||||
"https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
"https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||||
scrapeOptions.parse({})
|
scrapeOptions.parse({}),
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -358,7 +358,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
const out = await scrapeURL(
|
const out = await scrapeURL(
|
||||||
"test:scrape-docx",
|
"test:scrape-docx",
|
||||||
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
|
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
|
||||||
scrapeOptions.parse({})
|
scrapeOptions.parse({}),
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -367,7 +367,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty("metadata");
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.markdown).toContain(
|
expect(out.document.markdown).toContain(
|
||||||
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT",
|
||||||
);
|
);
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
@@ -388,13 +388,13 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
properties: {
|
properties: {
|
||||||
company_mission: { type: "string" },
|
company_mission: { type: "string" },
|
||||||
supports_sso: { type: "boolean" },
|
supports_sso: { type: "boolean" },
|
||||||
is_open_source: { type: "boolean" }
|
is_open_source: { type: "boolean" },
|
||||||
},
|
},
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"],
|
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||||
additionalProperties: false
|
additionalProperties: false,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
})
|
}),
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -423,13 +423,13 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
properties: {
|
properties: {
|
||||||
company_mission: { type: "string" },
|
company_mission: { type: "string" },
|
||||||
supports_sso: { type: "boolean" },
|
supports_sso: { type: "boolean" },
|
||||||
is_open_source: { type: "boolean" }
|
is_open_source: { type: "boolean" },
|
||||||
},
|
},
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"],
|
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||||
additionalProperties: false
|
additionalProperties: false,
|
||||||
}
|
},
|
||||||
}
|
},
|
||||||
})
|
}),
|
||||||
);
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
@@ -460,7 +460,7 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
message: value.message,
|
message: value.message,
|
||||||
name: value.name,
|
name: value.name,
|
||||||
cause: value.cause,
|
cause: value.cause,
|
||||||
stack: value.stack
|
stack: value.stack,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return value;
|
return value;
|
||||||
@@ -486,6 +486,6 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
30000
|
30000,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ export function saveToCache(meta: Meta, document: Document): Document {
|
|||||||
|
|
||||||
if (document.rawHtml === undefined) {
|
if (document.rawHtml === undefined) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"rawHtml is undefined -- this transformer is being called out of order"
|
"rawHtml is undefined -- this transformer is being called out of order",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -22,7 +22,7 @@ export function saveToCache(meta: Meta, document: Document): Document {
|
|||||||
html: document.rawHtml!,
|
html: document.rawHtml!,
|
||||||
statusCode: document.metadata.statusCode!,
|
statusCode: document.metadata.statusCode!,
|
||||||
url: document.metadata.url ?? document.metadata.sourceURL!,
|
url: document.metadata.url ?? document.metadata.sourceURL!,
|
||||||
error: document.metadata.error ?? undefined
|
error: document.metadata.error ?? undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
saveEntryToCache(key, entry);
|
saveEntryToCache(key, entry);
|
||||||
|
|||||||
@@ -11,33 +11,33 @@ import { saveToCache } from "./cache";
|
|||||||
|
|
||||||
export type Transformer = (
|
export type Transformer = (
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
) => Document | Promise<Document>;
|
) => Document | Promise<Document>;
|
||||||
|
|
||||||
export function deriveMetadataFromRawHTML(
|
export function deriveMetadataFromRawHTML(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
): Document {
|
): Document {
|
||||||
if (document.rawHtml === undefined) {
|
if (document.rawHtml === undefined) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"rawHtml is undefined -- this transformer is being called out of order"
|
"rawHtml is undefined -- this transformer is being called out of order",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.metadata = {
|
document.metadata = {
|
||||||
...extractMetadata(meta, document.rawHtml),
|
...extractMetadata(meta, document.rawHtml),
|
||||||
...document.metadata
|
...document.metadata,
|
||||||
};
|
};
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function deriveHTMLFromRawHTML(
|
export function deriveHTMLFromRawHTML(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
): Document {
|
): Document {
|
||||||
if (document.rawHtml === undefined) {
|
if (document.rawHtml === undefined) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"rawHtml is undefined -- this transformer is being called out of order"
|
"rawHtml is undefined -- this transformer is being called out of order",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,11 +47,11 @@ export function deriveHTMLFromRawHTML(
|
|||||||
|
|
||||||
export async function deriveMarkdownFromHTML(
|
export async function deriveMarkdownFromHTML(
|
||||||
_meta: Meta,
|
_meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
if (document.html === undefined) {
|
if (document.html === undefined) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"html is undefined -- this transformer is being called out of order"
|
"html is undefined -- this transformer is being called out of order",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -64,7 +64,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
|
|||||||
if (meta.options.formats.includes("links")) {
|
if (meta.options.formats.includes("links")) {
|
||||||
if (document.html === undefined) {
|
if (document.html === undefined) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"html is undefined -- this transformer is being called out of order"
|
"html is undefined -- this transformer is being called out of order",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -76,7 +76,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
|
|||||||
|
|
||||||
export function coerceFieldsToFormats(
|
export function coerceFieldsToFormats(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
): Document {
|
): Document {
|
||||||
const formats = new Set(meta.options.formats);
|
const formats = new Set(meta.options.formats);
|
||||||
|
|
||||||
@@ -84,7 +84,7 @@ export function coerceFieldsToFormats(
|
|||||||
delete document.markdown;
|
delete document.markdown;
|
||||||
} else if (formats.has("markdown") && document.markdown === undefined) {
|
} else if (formats.has("markdown") && document.markdown === undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format: markdown, but there was no markdown field in the result."
|
"Request had format: markdown, but there was no markdown field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,7 +92,7 @@ export function coerceFieldsToFormats(
|
|||||||
delete document.rawHtml;
|
delete document.rawHtml;
|
||||||
} else if (formats.has("rawHtml") && document.rawHtml === undefined) {
|
} else if (formats.has("rawHtml") && document.rawHtml === undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format: rawHtml, but there was no rawHtml field in the result."
|
"Request had format: rawHtml, but there was no rawHtml field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,7 +100,7 @@ export function coerceFieldsToFormats(
|
|||||||
delete document.html;
|
delete document.html;
|
||||||
} else if (formats.has("html") && document.html === undefined) {
|
} else if (formats.has("html") && document.html === undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format: html, but there was no html field in the result."
|
"Request had format: html, but there was no html field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -110,7 +110,7 @@ export function coerceFieldsToFormats(
|
|||||||
document.screenshot !== undefined
|
document.screenshot !== undefined
|
||||||
) {
|
) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug."
|
"Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug.",
|
||||||
);
|
);
|
||||||
delete document.screenshot;
|
delete document.screenshot;
|
||||||
} else if (
|
} else if (
|
||||||
@@ -118,29 +118,29 @@ export function coerceFieldsToFormats(
|
|||||||
document.screenshot === undefined
|
document.screenshot === undefined
|
||||||
) {
|
) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result."
|
"Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("links") && document.links !== undefined) {
|
if (!formats.has("links") && document.links !== undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug."
|
"Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug.",
|
||||||
);
|
);
|
||||||
delete document.links;
|
delete document.links;
|
||||||
} else if (formats.has("links") && document.links === undefined) {
|
} else if (formats.has("links") && document.links === undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format: links, but there was no links field in the result."
|
"Request had format: links, but there was no links field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("extract") && document.extract !== undefined) {
|
if (!formats.has("extract") && document.extract !== undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug."
|
"Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
|
||||||
);
|
);
|
||||||
delete document.extract;
|
delete document.extract;
|
||||||
} else if (formats.has("extract") && document.extract === undefined) {
|
} else if (formats.has("extract") && document.extract === undefined) {
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"Request had format: extract, but there was no extract field in the result."
|
"Request had format: extract, but there was no extract field in the result.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -161,12 +161,12 @@ export const transformerStack: Transformer[] = [
|
|||||||
uploadScreenshot,
|
uploadScreenshot,
|
||||||
performLLMExtract,
|
performLLMExtract,
|
||||||
coerceFieldsToFormats,
|
coerceFieldsToFormats,
|
||||||
removeBase64Images
|
removeBase64Images,
|
||||||
];
|
];
|
||||||
|
|
||||||
export async function executeTransformers(
|
export async function executeTransformers(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
const executions: [string, number][] = [];
|
const executions: [string, number][] = [];
|
||||||
|
|
||||||
@@ -174,8 +174,8 @@ export async function executeTransformers(
|
|||||||
const _meta = {
|
const _meta = {
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
method: "executeTransformers/" + transformer.name
|
method: "executeTransformers/" + transformer.name,
|
||||||
})
|
}),
|
||||||
};
|
};
|
||||||
const start = Date.now();
|
const start = Date.now();
|
||||||
document = await transformer(_meta, document);
|
document = await transformer(_meta, document);
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ function normalizeSchema(x: any): any {
|
|||||||
x["$defs"] = Object.fromEntries(
|
x["$defs"] = Object.fromEntries(
|
||||||
Object.entries(x["$defs"]).map(([name, schema]) => [
|
Object.entries(x["$defs"]).map(([name, schema]) => [
|
||||||
name,
|
name,
|
||||||
normalizeSchema(schema)
|
normalizeSchema(schema),
|
||||||
])
|
]),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,15 +50,15 @@ function normalizeSchema(x: any): any {
|
|||||||
return {
|
return {
|
||||||
...x,
|
...x,
|
||||||
properties: Object.fromEntries(
|
properties: Object.fromEntries(
|
||||||
Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)])
|
Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)]),
|
||||||
),
|
),
|
||||||
required: Object.keys(x.properties),
|
required: Object.keys(x.properties),
|
||||||
additionalProperties: false
|
additionalProperties: false,
|
||||||
};
|
};
|
||||||
} else if (x && x.type === "array") {
|
} else if (x && x.type === "array") {
|
||||||
return {
|
return {
|
||||||
...x,
|
...x,
|
||||||
items: normalizeSchema(x.items)
|
items: normalizeSchema(x.items),
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return x;
|
return x;
|
||||||
@@ -70,7 +70,7 @@ export async function generateOpenAICompletions(
|
|||||||
options: ExtractOptions,
|
options: ExtractOptions,
|
||||||
markdown?: string,
|
markdown?: string,
|
||||||
previousWarning?: string,
|
previousWarning?: string,
|
||||||
isExtractEndpoint?: boolean
|
isExtractEndpoint?: boolean,
|
||||||
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
|
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
|
||||||
let extract: any;
|
let extract: any;
|
||||||
let warning: string | undefined;
|
let warning: string | undefined;
|
||||||
@@ -125,19 +125,19 @@ export async function generateOpenAICompletions(
|
|||||||
schema = {
|
schema = {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
items: options.schema
|
items: options.schema,
|
||||||
},
|
},
|
||||||
required: ["items"],
|
required: ["items"],
|
||||||
additionalProperties: false
|
additionalProperties: false,
|
||||||
};
|
};
|
||||||
} else if (schema && typeof schema === "object" && !schema.type) {
|
} else if (schema && typeof schema === "object" && !schema.type) {
|
||||||
schema = {
|
schema = {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: Object.fromEntries(
|
properties: Object.fromEntries(
|
||||||
Object.entries(schema).map(([key, value]) => [key, { type: value }])
|
Object.entries(schema).map(([key, value]) => [key, { type: value }]),
|
||||||
),
|
),
|
||||||
required: Object.keys(schema),
|
required: Object.keys(schema),
|
||||||
additionalProperties: false
|
additionalProperties: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -149,19 +149,19 @@ export async function generateOpenAICompletions(
|
|||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: options.systemPrompt
|
content: options.systemPrompt,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
content: [{ type: "text", text: markdown }]
|
content: [{ type: "text", text: markdown }],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
content:
|
content:
|
||||||
options.prompt !== undefined
|
options.prompt !== undefined
|
||||||
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
|
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
|
||||||
: "Transform the above content into structured JSON output."
|
: "Transform the above content into structured JSON output.",
|
||||||
}
|
},
|
||||||
],
|
],
|
||||||
response_format: options.schema
|
response_format: options.schema
|
||||||
? {
|
? {
|
||||||
@@ -169,10 +169,10 @@ export async function generateOpenAICompletions(
|
|||||||
json_schema: {
|
json_schema: {
|
||||||
name: "websiteContent",
|
name: "websiteContent",
|
||||||
schema: schema,
|
schema: schema,
|
||||||
strict: true
|
strict: true,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
: { type: "json_object" }
|
: { type: "json_object" },
|
||||||
});
|
});
|
||||||
|
|
||||||
if (jsonCompletion.choices[0].message.refusal !== null) {
|
if (jsonCompletion.choices[0].message.refusal !== null) {
|
||||||
@@ -187,16 +187,16 @@ export async function generateOpenAICompletions(
|
|||||||
extract = JSON.parse(jsonCompletion.choices[0].message.content);
|
extract = JSON.parse(jsonCompletion.choices[0].message.content);
|
||||||
} else {
|
} else {
|
||||||
const extractData = JSON.parse(
|
const extractData = JSON.parse(
|
||||||
jsonCompletion.choices[0].message.content
|
jsonCompletion.choices[0].message.content,
|
||||||
);
|
);
|
||||||
extract = options.schema ? extractData.data.extract : extractData;
|
extract = options.schema ? extractData.data.extract : extractData;
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error("Failed to parse returned JSON, no schema specified.", {
|
logger.error("Failed to parse returned JSON, no schema specified.", {
|
||||||
error: e
|
error: e,
|
||||||
});
|
});
|
||||||
throw new LLMRefusalError(
|
throw new LLMRefusalError(
|
||||||
"Failed to parse returned JSON. Please specify a schema in the extract object."
|
"Failed to parse returned JSON. Please specify a schema in the extract object.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -215,16 +215,16 @@ export async function generateOpenAICompletions(
|
|||||||
|
|
||||||
export async function performLLMExtract(
|
export async function performLLMExtract(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
document: Document
|
document: Document,
|
||||||
): Promise<Document> {
|
): Promise<Document> {
|
||||||
if (meta.options.formats.includes("extract")) {
|
if (meta.options.formats.includes("extract")) {
|
||||||
const { extract, warning } = await generateOpenAICompletions(
|
const { extract, warning } = await generateOpenAICompletions(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "performLLMExtract/generateOpenAICompletions"
|
method: "performLLMExtract/generateOpenAICompletions",
|
||||||
}),
|
}),
|
||||||
meta.options.extract!,
|
meta.options.extract!,
|
||||||
document.markdown,
|
document.markdown,
|
||||||
document.warning
|
document.warning,
|
||||||
);
|
);
|
||||||
document.extract = extract;
|
document.extract = extract;
|
||||||
document.warning = warning;
|
document.warning = warning;
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ export function removeBase64Images(meta: Meta, document: Document): Document {
|
|||||||
if (meta.options.removeBase64Images && document.markdown !== undefined) {
|
if (meta.options.removeBase64Images && document.markdown !== undefined) {
|
||||||
document.markdown = document.markdown.replace(
|
document.markdown = document.markdown.replace(
|
||||||
regex,
|
regex,
|
||||||
"$1(<Base64-Image-Removed>)"
|
"$1(<Base64-Image-Removed>)",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return document;
|
return document;
|
||||||
|
|||||||
@@ -23,8 +23,8 @@ export function uploadScreenshot(meta: Meta, document: Document): Document {
|
|||||||
{
|
{
|
||||||
cacheControl: "3600",
|
cacheControl: "3600",
|
||||||
upsert: false,
|
upsert: false,
|
||||||
contentType: document.screenshot.split(":")[1].split(";")[0]
|
contentType: document.screenshot.split(":")[1].split(";")[0],
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`;
|
document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`;
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ export async function fireEngineMap(
|
|||||||
location?: string;
|
location?: string;
|
||||||
numResults: number;
|
numResults: number;
|
||||||
page?: number;
|
page?: number;
|
||||||
}
|
},
|
||||||
): Promise<SearchResult[]> {
|
): Promise<SearchResult[]> {
|
||||||
try {
|
try {
|
||||||
let data = JSON.stringify({
|
let data = JSON.stringify({
|
||||||
@@ -25,12 +25,12 @@ export async function fireEngineMap(
|
|||||||
location: options.location,
|
location: options.location,
|
||||||
tbs: options.tbs,
|
tbs: options.tbs,
|
||||||
numResults: options.numResults,
|
numResults: options.numResults,
|
||||||
page: options.page ?? 1
|
page: options.page ?? 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!process.env.FIRE_ENGINE_BETA_URL) {
|
if (!process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
console.warn(
|
console.warn(
|
||||||
"(v1/map Beta) Results might differ from cloud offering currently."
|
"(v1/map Beta) Results might differ from cloud offering currently.",
|
||||||
);
|
);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@@ -39,9 +39,9 @@ export async function fireEngineMap(
|
|||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"X-Disable-Cache": "true"
|
"X-Disable-Cache": "true",
|
||||||
},
|
},
|
||||||
body: data
|
body: data,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (response.ok) {
|
if (response.ok) {
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ const _useragent_list = [
|
|||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
||||||
];
|
];
|
||||||
|
|
||||||
function get_useragent(): string {
|
function get_useragent(): string {
|
||||||
@@ -27,14 +27,14 @@ async function _req(
|
|||||||
proxies: any,
|
proxies: any,
|
||||||
timeout: number,
|
timeout: number,
|
||||||
tbs: string | undefined = undefined,
|
tbs: string | undefined = undefined,
|
||||||
filter: string | undefined = undefined
|
filter: string | undefined = undefined,
|
||||||
) {
|
) {
|
||||||
const params = {
|
const params = {
|
||||||
q: term,
|
q: term,
|
||||||
num: results, // Number of results to return
|
num: results, // Number of results to return
|
||||||
hl: lang,
|
hl: lang,
|
||||||
gl: country,
|
gl: country,
|
||||||
start: start
|
start: start,
|
||||||
};
|
};
|
||||||
if (tbs) {
|
if (tbs) {
|
||||||
params["tbs"] = tbs;
|
params["tbs"] = tbs;
|
||||||
@@ -45,11 +45,11 @@ async function _req(
|
|||||||
try {
|
try {
|
||||||
const resp = await axios.get("https://www.google.com/search", {
|
const resp = await axios.get("https://www.google.com/search", {
|
||||||
headers: {
|
headers: {
|
||||||
"User-Agent": get_useragent()
|
"User-Agent": get_useragent(),
|
||||||
},
|
},
|
||||||
params: params,
|
params: params,
|
||||||
proxy: proxies,
|
proxy: proxies,
|
||||||
timeout: timeout
|
timeout: timeout,
|
||||||
});
|
});
|
||||||
return resp;
|
return resp;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -70,7 +70,7 @@ export async function googleSearch(
|
|||||||
country = "us",
|
country = "us",
|
||||||
proxy = undefined as string | undefined,
|
proxy = undefined as string | undefined,
|
||||||
sleep_interval = 0,
|
sleep_interval = 0,
|
||||||
timeout = 5000
|
timeout = 5000,
|
||||||
): Promise<SearchResult[]> {
|
): Promise<SearchResult[]> {
|
||||||
let proxies: any = null;
|
let proxies: any = null;
|
||||||
if (proxy) {
|
if (proxy) {
|
||||||
@@ -98,7 +98,7 @@ export async function googleSearch(
|
|||||||
proxies,
|
proxies,
|
||||||
timeout,
|
timeout,
|
||||||
tbs,
|
tbs,
|
||||||
filter
|
filter,
|
||||||
);
|
);
|
||||||
const $ = cheerio.load(resp.data);
|
const $ = cheerio.load(resp.data);
|
||||||
const result_block = $("div.g");
|
const result_block = $("div.g");
|
||||||
@@ -117,7 +117,7 @@ export async function googleSearch(
|
|||||||
const title = $(element).find("h3");
|
const title = $(element).find("h3");
|
||||||
const ogImage = $(element).find("img").eq(1).attr("src");
|
const ogImage = $(element).find("img").eq(1).attr("src");
|
||||||
const description_box = $(element).find(
|
const description_box = $(element).find(
|
||||||
"div[style='-webkit-line-clamp:2']"
|
"div[style='-webkit-line-clamp:2']",
|
||||||
);
|
);
|
||||||
const answerBox = $(element).find(".mod").text();
|
const answerBox = $(element).find(".mod").text();
|
||||||
if (description_box) {
|
if (description_box) {
|
||||||
@@ -129,7 +129,7 @@ export async function googleSearch(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
await new Promise((resolve) =>
|
await new Promise((resolve) =>
|
||||||
setTimeout(resolve, sleep_interval * 1000)
|
setTimeout(resolve, sleep_interval * 1000),
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.message === "Too many requests") {
|
if (error.message === "Too many requests") {
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ export async function search({
|
|||||||
location = undefined,
|
location = undefined,
|
||||||
proxy = undefined,
|
proxy = undefined,
|
||||||
sleep_interval = 0,
|
sleep_interval = 0,
|
||||||
timeout = 5000
|
timeout = 5000,
|
||||||
}: {
|
}: {
|
||||||
query: string;
|
query: string;
|
||||||
advanced?: boolean;
|
advanced?: boolean;
|
||||||
@@ -38,7 +38,7 @@ export async function search({
|
|||||||
filter,
|
filter,
|
||||||
lang,
|
lang,
|
||||||
country,
|
country,
|
||||||
location
|
location,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
if (process.env.SEARCHAPI_API_KEY) {
|
if (process.env.SEARCHAPI_API_KEY) {
|
||||||
@@ -48,7 +48,7 @@ export async function search({
|
|||||||
filter,
|
filter,
|
||||||
lang,
|
lang,
|
||||||
country,
|
country,
|
||||||
location
|
location,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return await googleSearch(
|
return await googleSearch(
|
||||||
@@ -61,7 +61,7 @@ export async function search({
|
|||||||
country,
|
country,
|
||||||
proxy,
|
proxy,
|
||||||
sleep_interval,
|
sleep_interval,
|
||||||
timeout
|
timeout,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error in search function: ${error}`);
|
logger.error(`Error in search function: ${error}`);
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ interface SearchOptions {
|
|||||||
|
|
||||||
export async function searchapi_search(
|
export async function searchapi_search(
|
||||||
q: string,
|
q: string,
|
||||||
options: SearchOptions
|
options: SearchOptions,
|
||||||
): Promise<SearchResult[]> {
|
): Promise<SearchResult[]> {
|
||||||
const params = {
|
const params = {
|
||||||
q: q,
|
q: q,
|
||||||
@@ -25,7 +25,7 @@ export async function searchapi_search(
|
|||||||
location: options.location,
|
location: options.location,
|
||||||
num: options.num_results,
|
num: options.num_results,
|
||||||
page: options.page ?? 1,
|
page: options.page ?? 1,
|
||||||
engine: process.env.SEARCHAPI_ENGINE || "google"
|
engine: process.env.SEARCHAPI_ENGINE || "google",
|
||||||
};
|
};
|
||||||
|
|
||||||
const url = `https://www.searchapi.io/api/v1/search`;
|
const url = `https://www.searchapi.io/api/v1/search`;
|
||||||
@@ -35,9 +35,9 @@ export async function searchapi_search(
|
|||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${process.env.SEARCHAPI_API_KEY}`,
|
Authorization: `Bearer ${process.env.SEARCHAPI_API_KEY}`,
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"X-SearchApi-Source": "Firecrawl"
|
"X-SearchApi-Source": "Firecrawl",
|
||||||
},
|
},
|
||||||
params: params
|
params: params,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (response.status === 401) {
|
if (response.status === 401) {
|
||||||
@@ -50,7 +50,7 @@ export async function searchapi_search(
|
|||||||
return data.organic_results.map((a: any) => ({
|
return data.organic_results.map((a: any) => ({
|
||||||
url: a.link,
|
url: a.link,
|
||||||
title: a.title,
|
title: a.title,
|
||||||
description: a.snippet
|
description: a.snippet,
|
||||||
}));
|
}));
|
||||||
} else {
|
} else {
|
||||||
return [];
|
return [];
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ export async function serper_search(
|
|||||||
location?: string;
|
location?: string;
|
||||||
num_results: number;
|
num_results: number;
|
||||||
page?: number;
|
page?: number;
|
||||||
}
|
},
|
||||||
): Promise<SearchResult[]> {
|
): Promise<SearchResult[]> {
|
||||||
let data = JSON.stringify({
|
let data = JSON.stringify({
|
||||||
q: q,
|
q: q,
|
||||||
@@ -23,7 +23,7 @@ export async function serper_search(
|
|||||||
location: options.location,
|
location: options.location,
|
||||||
tbs: options.tbs,
|
tbs: options.tbs,
|
||||||
num: options.num_results,
|
num: options.num_results,
|
||||||
page: options.page ?? 1
|
page: options.page ?? 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
let config = {
|
let config = {
|
||||||
@@ -31,16 +31,16 @@ export async function serper_search(
|
|||||||
url: "https://google.serper.dev/search",
|
url: "https://google.serper.dev/search",
|
||||||
headers: {
|
headers: {
|
||||||
"X-API-KEY": process.env.SERPER_API_KEY,
|
"X-API-KEY": process.env.SERPER_API_KEY,
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
data: data
|
data: data,
|
||||||
};
|
};
|
||||||
const response = await axios(config);
|
const response = await axios(config);
|
||||||
if (response && response.data && Array.isArray(response.data.organic)) {
|
if (response && response.data && Array.isArray(response.data.organic)) {
|
||||||
return response.data.organic.map((a) => ({
|
return response.data.organic.map((a) => ({
|
||||||
url: a.link,
|
url: a.link,
|
||||||
title: a.title,
|
title: a.title,
|
||||||
description: a.snippet
|
description: a.snippet,
|
||||||
}));
|
}));
|
||||||
} else {
|
} else {
|
||||||
return [];
|
return [];
|
||||||
|
|||||||
@@ -17,15 +17,15 @@ export async function checkAlerts() {
|
|||||||
const activeJobs = await scrapeQueue.getActiveCount();
|
const activeJobs = await scrapeQueue.getActiveCount();
|
||||||
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
|
if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) {
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`
|
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.`,
|
||||||
);
|
);
|
||||||
sendSlackWebhook(
|
sendSlackWebhook(
|
||||||
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`,
|
`Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`,
|
||||||
true
|
true,
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
logger.info(
|
logger.info(
|
||||||
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`
|
`Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -39,11 +39,11 @@ export async function checkAlerts() {
|
|||||||
|
|
||||||
if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) {
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`
|
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.`,
|
||||||
);
|
);
|
||||||
sendSlackWebhook(
|
sendSlackWebhook(
|
||||||
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}. Scale up the number of workers with fly scale count worker=20`,
|
`Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}. Scale up the number of workers with fly scale count worker=20`,
|
||||||
true
|
true,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -4,18 +4,18 @@ import { logger } from "../../../src/lib/logger";
|
|||||||
export async function sendSlackWebhook(
|
export async function sendSlackWebhook(
|
||||||
message: string,
|
message: string,
|
||||||
alertEveryone: boolean = false,
|
alertEveryone: boolean = false,
|
||||||
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? ""
|
webhookUrl: string = process.env.SLACK_WEBHOOK_URL ?? "",
|
||||||
) {
|
) {
|
||||||
const messagePrefix = alertEveryone ? "<!channel> " : "";
|
const messagePrefix = alertEveryone ? "<!channel> " : "";
|
||||||
const payload = {
|
const payload = {
|
||||||
text: `${messagePrefix} ${message}`
|
text: `${messagePrefix} ${message}`,
|
||||||
};
|
};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await axios.post(webhookUrl, payload, {
|
const response = await axios.post(webhookUrl, payload, {
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json",
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
logger.info("Webhook sent successfully:", response.data);
|
logger.info("Webhook sent successfully:", response.data);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds
|
|||||||
*/
|
*/
|
||||||
export async function autoCharge(
|
export async function autoCharge(
|
||||||
chunk: AuthCreditUsageChunk,
|
chunk: AuthCreditUsageChunk,
|
||||||
autoRechargeThreshold: number
|
autoRechargeThreshold: number,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
message: string;
|
message: string;
|
||||||
@@ -38,13 +38,13 @@ export async function autoCharge(
|
|||||||
const cooldownValue = await getValue(cooldownKey);
|
const cooldownValue = await getValue(cooldownKey);
|
||||||
if (cooldownValue) {
|
if (cooldownValue) {
|
||||||
logger.info(
|
logger.info(
|
||||||
`Auto-recharge for team ${chunk.team_id} is in cooldown period`
|
`Auto-recharge for team ${chunk.team_id} is in cooldown period`,
|
||||||
);
|
);
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
message: "Auto-recharge is in cooldown period",
|
message: "Auto-recharge is in cooldown period",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,7 +53,7 @@ export async function autoCharge(
|
|||||||
[resource],
|
[resource],
|
||||||
5000,
|
5000,
|
||||||
async (
|
async (
|
||||||
signal
|
signal,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
success: boolean;
|
success: boolean;
|
||||||
message: string;
|
message: string;
|
||||||
@@ -81,7 +81,7 @@ export async function autoCharge(
|
|||||||
success: false,
|
success: false,
|
||||||
message: "Error fetching customer data",
|
message: "Error fetching customer data",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -90,7 +90,7 @@ export async function autoCharge(
|
|||||||
// Attempt to create a payment intent
|
// Attempt to create a payment intent
|
||||||
const paymentStatus = await createPaymentIntent(
|
const paymentStatus = await createPaymentIntent(
|
||||||
chunk.team_id,
|
chunk.team_id,
|
||||||
customer.stripe_customer_id
|
customer.stripe_customer_id,
|
||||||
);
|
);
|
||||||
|
|
||||||
// If payment is successful or requires further action, issue credits
|
// If payment is successful or requires further action, issue credits
|
||||||
@@ -100,7 +100,7 @@ export async function autoCharge(
|
|||||||
) {
|
) {
|
||||||
issueCreditsSuccess = await issueCredits(
|
issueCreditsSuccess = await issueCredits(
|
||||||
chunk.team_id,
|
chunk.team_id,
|
||||||
AUTO_RECHARGE_CREDITS
|
AUTO_RECHARGE_CREDITS,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,7 +109,7 @@ export async function autoCharge(
|
|||||||
team_id: chunk.team_id,
|
team_id: chunk.team_id,
|
||||||
initial_payment_status: paymentStatus.return_status,
|
initial_payment_status: paymentStatus.return_status,
|
||||||
credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0,
|
credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0,
|
||||||
stripe_charge_id: paymentStatus.charge_id
|
stripe_charge_id: paymentStatus.charge_id,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Send a notification if credits were successfully issued
|
// Send a notification if credits were successfully issued
|
||||||
@@ -120,7 +120,7 @@ export async function autoCharge(
|
|||||||
chunk.sub_current_period_start,
|
chunk.sub_current_period_start,
|
||||||
chunk.sub_current_period_end,
|
chunk.sub_current_period_end,
|
||||||
chunk,
|
chunk,
|
||||||
true
|
true,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Set cooldown period
|
// Set cooldown period
|
||||||
@@ -139,7 +139,7 @@ export async function autoCharge(
|
|||||||
sendSlackWebhook(
|
sendSlackWebhook(
|
||||||
`Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`,
|
`Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`,
|
||||||
false,
|
false,
|
||||||
process.env.SLACK_ADMIN_WEBHOOK_URL
|
process.env.SLACK_ADMIN_WEBHOOK_URL,
|
||||||
).catch((error) => {
|
).catch((error) => {
|
||||||
logger.debug(`Error sending slack notification: ${error}`);
|
logger.debug(`Error sending slack notification: ${error}`);
|
||||||
});
|
});
|
||||||
@@ -156,8 +156,8 @@ export async function autoCharge(
|
|||||||
chunk: {
|
chunk: {
|
||||||
...chunk,
|
...chunk,
|
||||||
remaining_credits:
|
remaining_credits:
|
||||||
chunk.remaining_credits + AUTO_RECHARGE_CREDITS
|
chunk.remaining_credits + AUTO_RECHARGE_CREDITS,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
logger.error("No Stripe customer ID found for user");
|
logger.error("No Stripe customer ID found for user");
|
||||||
@@ -165,7 +165,7 @@ export async function autoCharge(
|
|||||||
success: false,
|
success: false,
|
||||||
message: "No Stripe customer ID found for user",
|
message: "No Stripe customer ID found for user",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -174,7 +174,7 @@ export async function autoCharge(
|
|||||||
success: false,
|
success: false,
|
||||||
message: "No sub_user_id found in chunk",
|
message: "No sub_user_id found in chunk",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -182,9 +182,9 @@ export async function autoCharge(
|
|||||||
success: false,
|
success: false,
|
||||||
message: "No need to auto-recharge",
|
message: "No need to auto-recharge",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Failed to acquire lock for auto-recharge: ${error}`);
|
logger.error(`Failed to acquire lock for auto-recharge: ${error}`);
|
||||||
@@ -192,7 +192,7 @@ export async function autoCharge(
|
|||||||
success: false,
|
success: false,
|
||||||
message: "Failed to acquire lock for auto-recharge",
|
message: "Failed to acquire lock for auto-recharge",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,18 +19,18 @@ const FREE_CREDITS = 500;
|
|||||||
export async function billTeam(
|
export async function billTeam(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
subscription_id: string | null | undefined,
|
subscription_id: string | null | undefined,
|
||||||
credits: number
|
credits: number,
|
||||||
) {
|
) {
|
||||||
return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(
|
return withAuth(supaBillTeam, { success: true, message: "No DB, bypassed." })(
|
||||||
team_id,
|
team_id,
|
||||||
subscription_id,
|
subscription_id,
|
||||||
credits
|
credits,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
export async function supaBillTeam(
|
export async function supaBillTeam(
|
||||||
team_id: string,
|
team_id: string,
|
||||||
subscription_id: string | null | undefined,
|
subscription_id: string | null | undefined,
|
||||||
credits: number
|
credits: number,
|
||||||
) {
|
) {
|
||||||
if (team_id === "preview") {
|
if (team_id === "preview") {
|
||||||
return { success: true, message: "Preview team, no credits used" };
|
return { success: true, message: "Preview team, no credits used" };
|
||||||
@@ -41,7 +41,7 @@ export async function supaBillTeam(
|
|||||||
_team_id: team_id,
|
_team_id: team_id,
|
||||||
sub_id: subscription_id ?? null,
|
sub_id: subscription_id ?? null,
|
||||||
fetch_subscription: subscription_id === undefined,
|
fetch_subscription: subscription_id === undefined,
|
||||||
credits
|
credits,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
@@ -58,9 +58,9 @@ export async function supaBillTeam(
|
|||||||
...acuc,
|
...acuc,
|
||||||
credits_used: acuc.credits_used + credits,
|
credits_used: acuc.credits_used + credits,
|
||||||
adjusted_credits_used: acuc.adjusted_credits_used + credits,
|
adjusted_credits_used: acuc.adjusted_credits_used + credits,
|
||||||
remaining_credits: acuc.remaining_credits - credits
|
remaining_credits: acuc.remaining_credits - credits,
|
||||||
}
|
}
|
||||||
: null
|
: null,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
})();
|
})();
|
||||||
@@ -76,12 +76,12 @@ export type CheckTeamCreditsResponse = {
|
|||||||
export async function checkTeamCredits(
|
export async function checkTeamCredits(
|
||||||
chunk: AuthCreditUsageChunk | null,
|
chunk: AuthCreditUsageChunk | null,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
credits: number
|
credits: number,
|
||||||
): Promise<CheckTeamCreditsResponse> {
|
): Promise<CheckTeamCreditsResponse> {
|
||||||
return withAuth(supaCheckTeamCredits, {
|
return withAuth(supaCheckTeamCredits, {
|
||||||
success: true,
|
success: true,
|
||||||
message: "No DB, bypassed",
|
message: "No DB, bypassed",
|
||||||
remainingCredits: Infinity
|
remainingCredits: Infinity,
|
||||||
})(chunk, team_id, credits);
|
})(chunk, team_id, credits);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -89,14 +89,14 @@ export async function checkTeamCredits(
|
|||||||
export async function supaCheckTeamCredits(
|
export async function supaCheckTeamCredits(
|
||||||
chunk: AuthCreditUsageChunk | null,
|
chunk: AuthCreditUsageChunk | null,
|
||||||
team_id: string,
|
team_id: string,
|
||||||
credits: number
|
credits: number,
|
||||||
): Promise<CheckTeamCreditsResponse> {
|
): Promise<CheckTeamCreditsResponse> {
|
||||||
// WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery
|
// WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery
|
||||||
if (team_id === "preview") {
|
if (team_id === "preview") {
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
message: "Preview team, no credits used",
|
message: "Preview team, no credits used",
|
||||||
remainingCredits: Infinity
|
remainingCredits: Infinity,
|
||||||
};
|
};
|
||||||
} else if (chunk === null) {
|
} else if (chunk === null) {
|
||||||
throw new Error("NULL ACUC passed to supaCheckTeamCredits");
|
throw new Error("NULL ACUC passed to supaCheckTeamCredits");
|
||||||
@@ -141,7 +141,7 @@ export async function supaCheckTeamCredits(
|
|||||||
success: true,
|
success: true,
|
||||||
message: autoChargeResult.message,
|
message: autoChargeResult.message,
|
||||||
remainingCredits: autoChargeResult.remainingCredits,
|
remainingCredits: autoChargeResult.remainingCredits,
|
||||||
chunk: autoChargeResult.chunk
|
chunk: autoChargeResult.chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -155,7 +155,7 @@ export async function supaCheckTeamCredits(
|
|||||||
NotificationType.LIMIT_REACHED,
|
NotificationType.LIMIT_REACHED,
|
||||||
chunk.sub_current_period_start,
|
chunk.sub_current_period_start,
|
||||||
chunk.sub_current_period_end,
|
chunk.sub_current_period_end,
|
||||||
chunk
|
chunk,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
@@ -163,7 +163,7 @@ export async function supaCheckTeamCredits(
|
|||||||
message:
|
message:
|
||||||
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.",
|
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
|
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
|
||||||
// Send email notification for approaching credit limit
|
// Send email notification for approaching credit limit
|
||||||
@@ -172,7 +172,7 @@ export async function supaCheckTeamCredits(
|
|||||||
NotificationType.APPROACHING_LIMIT,
|
NotificationType.APPROACHING_LIMIT,
|
||||||
chunk.sub_current_period_start,
|
chunk.sub_current_period_start,
|
||||||
chunk.sub_current_period_end,
|
chunk.sub_current_period_end,
|
||||||
chunk
|
chunk,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -180,13 +180,13 @@ export async function supaCheckTeamCredits(
|
|||||||
success: true,
|
success: true,
|
||||||
message: "Sufficient credits available",
|
message: "Sufficient credits available",
|
||||||
remainingCredits: chunk.remaining_credits,
|
remainingCredits: chunk.remaining_credits,
|
||||||
chunk
|
chunk,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count the total credits used by a team within the current billing period and return the remaining credits.
|
// Count the total credits used by a team within the current billing period and return the remaining credits.
|
||||||
export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
||||||
team_id: string
|
team_id: string,
|
||||||
) {
|
) {
|
||||||
// 1. Retrieve the team's active subscription based on the team_id.
|
// 1. Retrieve the team's active subscription based on the team_id.
|
||||||
const { data: subscription, error: subscriptionError } =
|
const { data: subscription, error: subscriptionError } =
|
||||||
@@ -206,7 +206,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
|||||||
if (coupons && coupons.length > 0) {
|
if (coupons && coupons.length > 0) {
|
||||||
couponCredits = coupons.reduce(
|
couponCredits = coupons.reduce(
|
||||||
(total, coupon) => total + coupon.credits,
|
(total, coupon) => total + coupon.credits,
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,20 +221,20 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
|||||||
|
|
||||||
if (creditUsageError || !creditUsages) {
|
if (creditUsageError || !creditUsages) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Failed to retrieve credit usage for team_id: ${team_id}`
|
`Failed to retrieve credit usage for team_id: ${team_id}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const totalCreditsUsed = creditUsages.reduce(
|
const totalCreditsUsed = creditUsages.reduce(
|
||||||
(acc, usage) => acc + usage.credits_used,
|
(acc, usage) => acc + usage.credits_used,
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
|
|
||||||
const remainingCredits = FREE_CREDITS + couponCredits - totalCreditsUsed;
|
const remainingCredits = FREE_CREDITS + couponCredits - totalCreditsUsed;
|
||||||
return {
|
return {
|
||||||
totalCreditsUsed: totalCreditsUsed,
|
totalCreditsUsed: totalCreditsUsed,
|
||||||
remainingCredits,
|
remainingCredits,
|
||||||
totalCredits: FREE_CREDITS + couponCredits
|
totalCredits: FREE_CREDITS + couponCredits,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -247,13 +247,13 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
|||||||
|
|
||||||
if (creditUsageError || !creditUsages) {
|
if (creditUsageError || !creditUsages) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Failed to retrieve credit usage for subscription_id: ${subscription.id}`
|
`Failed to retrieve credit usage for subscription_id: ${subscription.id}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const totalCreditsUsed = creditUsages.reduce(
|
const totalCreditsUsed = creditUsages.reduce(
|
||||||
(acc, usage) => acc + usage.credits_used,
|
(acc, usage) => acc + usage.credits_used,
|
||||||
0
|
0,
|
||||||
);
|
);
|
||||||
|
|
||||||
const { data: price, error: priceError } = await supabase_service
|
const { data: price, error: priceError } = await supabase_service
|
||||||
@@ -264,7 +264,7 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
|||||||
|
|
||||||
if (priceError || !price) {
|
if (priceError || !price) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Failed to retrieve price for price_id: ${subscription.price_id}`
|
`Failed to retrieve price for price_id: ${subscription.price_id}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -273,6 +273,6 @@ export async function countCreditsAndRemainingForCurrentBillingPeriod(
|
|||||||
return {
|
return {
|
||||||
totalCreditsUsed,
|
totalCreditsUsed,
|
||||||
remainingCredits,
|
remainingCredits,
|
||||||
totalCredits: price.credits
|
totalCredits: price.credits,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user