Nick: fixed prettier
This commit is contained in:
@@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"trailingComma": "none"
|
||||||
|
}
|
||||||
@@ -6,7 +6,7 @@
|
|||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "nodemon --exec ts-node src/index.ts",
|
"start": "nodemon --exec ts-node src/index.ts",
|
||||||
"start:production": "tsc && node dist/src/index.js",
|
"start:production": "tsc && node dist/src/index.js",
|
||||||
"format": "prettier --write \"src/**/*.(js|ts)\"",
|
"format": "npx prettier --write \"src/**/*.(js|ts)\"",
|
||||||
"flyio": "node dist/src/index.js",
|
"flyio": "node dist/src/index.js",
|
||||||
"start:dev": "nodemon --exec ts-node src/index.ts",
|
"start:dev": "nodemon --exec ts-node src/index.ts",
|
||||||
"build": "tsc && pnpm sentry:sourcemaps",
|
"build": "tsc && pnpm sentry:sourcemaps",
|
||||||
|
|||||||
@@ -3,14 +3,16 @@ import dotenv from "dotenv";
|
|||||||
import {
|
import {
|
||||||
FirecrawlCrawlResponse,
|
FirecrawlCrawlResponse,
|
||||||
FirecrawlCrawlStatusResponse,
|
FirecrawlCrawlStatusResponse,
|
||||||
FirecrawlScrapeResponse,
|
FirecrawlScrapeResponse
|
||||||
} from "../../types";
|
} from "../../types";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
const TEST_URL = "http://127.0.0.1:3002";
|
const TEST_URL = "http://127.0.0.1:3002";
|
||||||
|
|
||||||
describe("E2E Tests for Extract API Routes", () => {
|
describe("E2E Tests for Extract API Routes", () => {
|
||||||
it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
|
it.concurrent(
|
||||||
|
"should return authors of blog posts on firecrawl.dev",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -20,8 +22,10 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
prompt: "Who are the authors of the blog posts?",
|
prompt: "Who are the authors of the blog posts?",
|
||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: { authors: { type: "array", items: { type: "string" } } },
|
properties: {
|
||||||
},
|
authors: { type: "array", items: { type: "string" } }
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -37,13 +41,16 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
if (author.includes("Nicolas Camara")) gotItRight++;
|
if (author.includes("Nicolas Camara")) gotItRight++;
|
||||||
if (author.includes("Jon")) gotItRight++;
|
if (author.includes("Jon")) gotItRight++;
|
||||||
if (author.includes("Wendong")) gotItRight++;
|
if (author.includes("Wendong")) gotItRight++;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(gotItRight).toBeGreaterThan(1);
|
expect(gotItRight).toBeGreaterThan(1);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
|
it.concurrent(
|
||||||
|
"should return founders of firecrawl.dev (allowExternalLinks = true)",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -54,8 +61,10 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
allowExternalLinks: true,
|
allowExternalLinks: true,
|
||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: { founders: { type: "array", items: { type: "string" } } },
|
properties: {
|
||||||
},
|
founders: { type: "array", items: { type: "string" } }
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -70,13 +79,16 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
if (founder.includes("nick")) gotItRight++;
|
if (founder.includes("nick")) gotItRight++;
|
||||||
if (founder.includes("eric")) gotItRight++;
|
if (founder.includes("eric")) gotItRight++;
|
||||||
if (founder.includes("jon-noronha")) gotItRight++;
|
if (founder.includes("jon-noronha")) gotItRight++;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(gotItRight).toBeGreaterThanOrEqual(2);
|
expect(gotItRight).toBeGreaterThanOrEqual(2);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
|
it.concurrent(
|
||||||
|
"should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -91,7 +103,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
type: "string"
|
type: "string"
|
||||||
},
|
},
|
||||||
required: ["items"]
|
required: ["items"]
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -105,9 +117,13 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expect(gotItRight).toBeGreaterThan(2);
|
expect(gotItRight).toBeGreaterThan(2);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
|
it.concurrent(
|
||||||
|
"should return PCI DSS compliance for Fivetran",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -121,14 +137,18 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
properties: {
|
properties: {
|
||||||
pciDssCompliance: { type: "boolean" }
|
pciDssCompliance: { type: "boolean" }
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
expect(response.body.data?.pciDssCompliance).toBe(true);
|
expect(response.body.data?.pciDssCompliance).toBe(true);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
|
it.concurrent(
|
||||||
|
"should return Azure Data Connectors for Fivetran",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -147,21 +167,27 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
// expect(response.statusCode).toBe(200);
|
// expect(response.statusCode).toBe(200);
|
||||||
// expect(response.body).toHaveProperty("data");
|
// expect(response.body).toHaveProperty("data");
|
||||||
// expect(response.body.data?.pciDssCompliance).toBe(true);
|
// expect(response.body.data?.pciDssCompliance).toBe(true);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => {
|
it.concurrent(
|
||||||
|
"should return Greenhouse Applicant Tracking System for Abnormal Security",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"],
|
urls: [
|
||||||
|
"https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"
|
||||||
|
],
|
||||||
prompt: "what applicant tracking system is this company using?",
|
prompt: "what applicant tracking system is this company using?",
|
||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
@@ -171,15 +197,19 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
allowExternalLinks: true
|
allowExternalLinks: true
|
||||||
})
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
expect(response.body.data?.isGreenhouseATS).toBe(true);
|
expect(response.body.data?.isGreenhouseATS).toBe(true);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return mintlify api components", async () => {
|
it.concurrent(
|
||||||
|
"should return mintlify api components",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -198,7 +228,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
required: ["items"]
|
required: ["items"]
|
||||||
},
|
},
|
||||||
allowExternalLinks: true
|
allowExternalLinks: true
|
||||||
})
|
});
|
||||||
|
|
||||||
console.log(response.body.data?.items);
|
console.log(response.body.data?.items);
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
@@ -206,24 +236,32 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
expect(response.body.data?.items.length).toBe(4);
|
expect(response.body.data?.items.length).toBe(4);
|
||||||
let gotItRight = 0;
|
let gotItRight = 0;
|
||||||
for (const component of response.body.data?.items) {
|
for (const component of response.body.data?.items) {
|
||||||
if (component.component.toLowerCase().includes("parameter")) gotItRight++;
|
if (component.component.toLowerCase().includes("parameter"))
|
||||||
if (component.component.toLowerCase().includes("response")) gotItRight++;
|
gotItRight++;
|
||||||
if (component.component.toLowerCase().includes("expandable")) gotItRight++;
|
if (component.component.toLowerCase().includes("response"))
|
||||||
|
gotItRight++;
|
||||||
|
if (component.component.toLowerCase().includes("expandable"))
|
||||||
|
gotItRight++;
|
||||||
if (component.component.toLowerCase().includes("sticky")) gotItRight++;
|
if (component.component.toLowerCase().includes("sticky")) gotItRight++;
|
||||||
if (component.component.toLowerCase().includes("examples")) gotItRight++;
|
if (component.component.toLowerCase().includes("examples"))
|
||||||
|
gotItRight++;
|
||||||
}
|
}
|
||||||
expect(gotItRight).toBeGreaterThan(2);
|
expect(gotItRight).toBeGreaterThan(2);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return information about Eric Ciarla", async () => {
|
it.concurrent(
|
||||||
|
"should return information about Eric Ciarla",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
urls: ["https://ericciarla.com/"],
|
urls: ["https://ericciarla.com/"],
|
||||||
prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?",
|
prompt:
|
||||||
|
"Who is Eric Ciarla? Where does he work? Where did he go to school?",
|
||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
@@ -234,7 +272,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
required: ["name", "work", "education"]
|
required: ["name", "work", "education"]
|
||||||
},
|
},
|
||||||
allowExternalLinks: true
|
allowExternalLinks: true
|
||||||
})
|
});
|
||||||
|
|
||||||
console.log(response.body.data);
|
console.log(response.body.data);
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
@@ -242,9 +280,13 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
expect(response.body.data?.name).toBe("Eric Ciarla");
|
expect(response.body.data?.name).toBe("Eric Ciarla");
|
||||||
expect(response.body.data?.work).toBeDefined();
|
expect(response.body.data?.work).toBeDefined();
|
||||||
expect(response.body.data?.education).toBeDefined();
|
expect(response.body.data?.education).toBeDefined();
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should extract information without a schema", async () => {
|
it.concurrent(
|
||||||
|
"should extract information without a schema",
|
||||||
|
async () => {
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
.post("/v1/extract")
|
.post("/v1/extract")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -259,8 +301,7 @@ describe("E2E Tests for Extract API Routes", () => {
|
|||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
expect(typeof response.body.data).toBe("object");
|
expect(typeof response.body.data).toBe("object");
|
||||||
expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
|
expect(Object.keys(response.body.data).length).toBeGreaterThan(0);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -15,7 +15,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: false,
|
sitemapOnly: false,
|
||||||
search: "smart-crawl",
|
search: "smart-crawl"
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -37,7 +37,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: false,
|
sitemapOnly: false,
|
||||||
includeSubdomains: true,
|
includeSubdomains: true
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -60,7 +60,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: true,
|
sitemapOnly: true
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -84,7 +84,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
sitemapOnly: false,
|
sitemapOnly: false,
|
||||||
limit: 10,
|
limit: 10
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
@@ -104,7 +104,7 @@ describe("E2E Tests for Map API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://geekflare.com/sitemap_index.xml",
|
url: "https://geekflare.com/sitemap_index.xml",
|
||||||
sitemapOnly: true,
|
sitemapOnly: true
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(response.body);
|
console.log(response.body);
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
process.env = originalEnv;
|
process.env = originalEnv;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
describe("GET /", () => {
|
describe("GET /", () => {
|
||||||
it("should return Hello, world! message", async () => {
|
it("should return Hello, world! message", async () => {
|
||||||
const response = await request(TEST_URL).get("/");
|
const response = await request(TEST_URL).get("/");
|
||||||
@@ -62,7 +61,9 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
expect(response.body.error).toContain(
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return a successful response", async () => {
|
it("should return a successful response", async () => {
|
||||||
@@ -87,7 +88,9 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
expect(response.body.error).toContain(
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return a successful response", async () => {
|
it("should return a successful response", async () => {
|
||||||
@@ -116,7 +119,9 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: blocklistedUrl });
|
.send({ url: blocklistedUrl });
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
expect(response.body.error).toContain(
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return a successful response", async () => {
|
it("should return a successful response", async () => {
|
||||||
@@ -199,8 +204,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
|
|
||||||
|
|
||||||
}, 60000); // 60 seconds
|
}, 60000); // 60 seconds
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import request from "supertest";
|
|||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import {
|
import {
|
||||||
ScrapeRequestInput,
|
ScrapeRequestInput,
|
||||||
ScrapeResponseRequestTest,
|
ScrapeResponseRequestTest
|
||||||
} from "../../controllers/v1/types";
|
} from "../../controllers/v1/types";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
@@ -19,15 +19,17 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
describe("GET /is-production", () => {
|
describe("GET /is-production", () => {
|
||||||
it.concurrent("should return the production status", async () => {
|
it.concurrent("should return the production status", async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
|
const response: ScrapeResponseRequestTest =
|
||||||
"/is-production"
|
await request(TEST_URL).get("/is-production");
|
||||||
);
|
|
||||||
|
|
||||||
console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION);
|
console.log(
|
||||||
console.log('?', process.env.USE_DB_AUTHENTICATION === 'true');
|
"process.env.USE_DB_AUTHENTICATION",
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
process.env.USE_DB_AUTHENTICATION
|
||||||
console.log('!!useDbAuthentication', !!useDbAuthentication);
|
);
|
||||||
console.log('!useDbAuthentication', !useDbAuthentication);
|
console.log("?", process.env.USE_DB_AUTHENTICATION === "true");
|
||||||
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
|
console.log("!!useDbAuthentication", !!useDbAuthentication);
|
||||||
|
console.log("!useDbAuthentication", !useDbAuthentication);
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("isProduction");
|
expect(response.body).toHaveProperty("isProduction");
|
||||||
@@ -38,14 +40,14 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.send({ url: "https://firecrawl.dev"})
|
.send({ url: "https://firecrawl.dev" });
|
||||||
|
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://facebook.com/fake-test",
|
url: "https://facebook.com/fake-test"
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
@@ -55,7 +57,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.send(scrapeRequest);
|
.send(scrapeRequest);
|
||||||
|
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
expect(response.body.error).toBe(
|
||||||
|
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -74,7 +78,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai"
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -126,7 +130,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
"should return a successful response with a valid API key",
|
"should return a successful response with a valid API key",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/abs/2410.04840",
|
url: "https://arxiv.org/abs/2410.04840"
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -146,8 +150,12 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).not.toHaveProperty("html");
|
expect(response.body.data).not.toHaveProperty("html");
|
||||||
expect(response.body.data.markdown).toContain("Strong Model Collapse");
|
expect(response.body.data.markdown).toContain("Strong Model Collapse");
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse");
|
expect(response.body.data.metadata.description).toContain(
|
||||||
expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse");
|
"Abstract page for arXiv paper 2410.04840: Strong Model Collapse"
|
||||||
|
);
|
||||||
|
expect(response.body.data.metadata.citation_title).toBe(
|
||||||
|
"Strong Model Collapse"
|
||||||
|
);
|
||||||
expect(response.body.data.metadata.citation_author).toEqual([
|
expect(response.body.data.metadata.citation_author).toEqual([
|
||||||
"Dohmatob, Elvis",
|
"Dohmatob, Elvis",
|
||||||
"Feng, Yunzhen",
|
"Feng, Yunzhen",
|
||||||
@@ -155,11 +163,21 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
"Kempe, Julia"
|
"Kempe, Julia"
|
||||||
]);
|
]);
|
||||||
expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
|
expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
|
||||||
expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08");
|
expect(response.body.data.metadata.citation_online_date).toBe(
|
||||||
expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840");
|
"2024/10/08"
|
||||||
expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840");
|
);
|
||||||
expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm");
|
expect(response.body.data.metadata.citation_pdf_url).toBe(
|
||||||
expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840");
|
"http://arxiv.org/pdf/2410.04840"
|
||||||
|
);
|
||||||
|
expect(response.body.data.metadata.citation_arxiv_id).toBe(
|
||||||
|
"2410.04840"
|
||||||
|
);
|
||||||
|
expect(response.body.data.metadata.citation_abstract).toContain(
|
||||||
|
"Within the scaling laws paradigm"
|
||||||
|
);
|
||||||
|
expect(response.body.data.metadata.sourceURL).toBe(
|
||||||
|
"https://arxiv.org/abs/2410.04840"
|
||||||
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
},
|
},
|
||||||
30000
|
30000
|
||||||
@@ -169,7 +187,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["markdown", "html"],
|
formats: ["markdown", "html"]
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -193,58 +211,73 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
},
|
},
|
||||||
30000
|
30000
|
||||||
);
|
);
|
||||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response for a valid scrape with PDF file",
|
||||||
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||||
// formats: ["markdown", "html"],
|
// formats: ["markdown", "html"],
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post('/v1/scrape')
|
.post("/v1/scrape")
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set('Content-Type', 'application/json')
|
.set("Content-Type", "application/json")
|
||||||
.send(scrapeRequest);
|
.send(scrapeRequest);
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"Broad Line Radio Galaxy"
|
||||||
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response for a valid scrape with PDF file without explicit .pdf extension",
|
||||||
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post('/v1/scrape')
|
.post("/v1/scrape")
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set('Content-Type', 'application/json')
|
.set("Content-Type", "application/json")
|
||||||
.send(scrapeRequest);
|
.send(scrapeRequest);
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty('markdown');
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"Broad Line Radio Galaxy"
|
||||||
|
);
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response with a valid API key with removeTags option",
|
||||||
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
onlyMainContent: false // default is true
|
onlyMainContent: false // default is true
|
||||||
};
|
};
|
||||||
const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL)
|
const responseWithoutRemoveTags: ScrapeResponseRequestTest =
|
||||||
|
await request(TEST_URL)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -258,12 +291,16 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
|
||||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
"[FAQ](/faq/)"
|
||||||
|
); // .nav
|
||||||
|
expect(responseWithoutRemoveTags.body.data.markdown).toContain(
|
||||||
|
"Hartley Brody 2023"
|
||||||
|
); // #footer
|
||||||
|
|
||||||
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
|
const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
excludeTags: ['.nav', '#footer', 'strong'],
|
excludeTags: [".nav", "#footer", "strong"],
|
||||||
onlyMainContent: false // default is true
|
onlyMainContent: false // default is true
|
||||||
};
|
};
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -282,44 +319,53 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).not.toHaveProperty("html");
|
expect(response.body.data).not.toHaveProperty("html");
|
||||||
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
||||||
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
||||||
}, 30000);
|
},
|
||||||
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response for a scrape with 400 page",
|
||||||
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post('/v1/scrape')
|
.post("/v1/scrape")
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set('Content-Type', 'application/json')
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: 'https://httpstat.us/400' });
|
.send({ url: "https://httpstat.us/400" });
|
||||||
await new Promise((r) => setTimeout(r, 5000));
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty('markdown');
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(400);
|
expect(response.body.data.metadata.statusCode).toBe(400);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
|
it.concurrent(
|
||||||
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
"should return a successful response for a scrape with 401 page",
|
||||||
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post('/v1/scrape')
|
.post("/v1/scrape")
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set('Content-Type', 'application/json')
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: 'https://httpstat.us/401' });
|
.send({ url: "https://httpstat.us/401" });
|
||||||
await new Promise((r) => setTimeout(r, 5000));
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty('markdown');
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(401);
|
expect(response.body.data.metadata.statusCode).toBe(401);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
// Removed it as we want to retry fallback to the next scraper
|
// Removed it as we want to retry fallback to the next scraper
|
||||||
// it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
// it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
||||||
@@ -340,23 +386,27 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// expect(response.body.data.metadata.statusCode).toBe(403);
|
// expect(response.body.data.metadata.statusCode).toBe(403);
|
||||||
// }, 60000);
|
// }, 60000);
|
||||||
|
|
||||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response for a scrape with 404 page",
|
||||||
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post('/v1/scrape')
|
.post("/v1/scrape")
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set('Content-Type', 'application/json')
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: 'https://httpstat.us/404' });
|
.send({ url: "https://httpstat.us/404" });
|
||||||
await new Promise((r) => setTimeout(r, 5000));
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty('markdown');
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(404);
|
expect(response.body.data.metadata.statusCode).toBe(404);
|
||||||
}, 60000);
|
},
|
||||||
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
// it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
// it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||||
// const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
// const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -394,7 +444,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// expect(response.body.data.metadata.statusCode).toBe(500);
|
// expect(response.body.data.metadata.statusCode).toBe(500);
|
||||||
// }, 60000);
|
// }, 60000);
|
||||||
|
|
||||||
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
|
it.concurrent(
|
||||||
|
"should return a timeout error when scraping takes longer than the specified timeout",
|
||||||
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
@@ -402,14 +454,16 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
.send({ url: "https://firecrawl.dev", timeout: 1000 });
|
.send({ url: "https://firecrawl.dev", timeout: 1000 });
|
||||||
|
|
||||||
expect(response.statusCode).toBe(408);
|
expect(response.statusCode).toBe(408);
|
||||||
}, 3000);
|
},
|
||||||
|
3000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key and includeHtml set to true",
|
"should return a successful response with a valid API key and includeHtml set to true",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["html","rawHtml"],
|
formats: ["html", "rawHtml"]
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -463,7 +517,6 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data.markdown).toContain("PagerDuty");
|
expect(response.body.data.markdown).toContain("PagerDuty");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||||
expect(response.body.data.metadata.error).toBeUndefined();
|
expect(response.body.data.metadata.error).toBeUndefined();
|
||||||
|
|
||||||
},
|
},
|
||||||
30000
|
30000
|
||||||
);
|
);
|
||||||
@@ -473,7 +526,7 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
formats: ["links"],
|
formats: ["links"]
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -497,11 +550,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
},
|
},
|
||||||
30000
|
30000
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("POST /v1/map", () => {
|
describe("POST /v1/map", () => {
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/map")
|
.post("/v1/map")
|
||||||
@@ -509,16 +560,21 @@ describe("POST /v1/map", () => {
|
|||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
it.concurrent(
|
||||||
|
"should return an error response with an invalid API key",
|
||||||
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/map")
|
.post("/v1/map")
|
||||||
.set("Authorization", `Bearer invalid-api-key`)
|
.set("Authorization", `Bearer invalid-api-key`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://firecrawl.dev" });
|
.send({ url: "https://firecrawl.dev" });
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key", async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response with a valid API key",
|
||||||
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://roastmywebsite.ai"
|
url: "https://roastmywebsite.ai"
|
||||||
};
|
};
|
||||||
@@ -538,9 +594,12 @@ describe("POST /v1/map", () => {
|
|||||||
const links = response.body.links as unknown[];
|
const links = response.body.links as unknown[];
|
||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key and search", async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response with a valid API key and search",
|
||||||
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://usemotion.com",
|
url: "https://usemotion.com",
|
||||||
search: "pricing"
|
search: "pricing"
|
||||||
@@ -562,9 +621,12 @@ describe("POST /v1/map", () => {
|
|||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
expect(links[0]).toContain("usemotion.com/pricing");
|
expect(links[0]).toContain("usemotion.com/pricing");
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response with a valid API key and search and allowSubdomains",
|
||||||
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
search: "docs",
|
search: "docs",
|
||||||
@@ -587,11 +649,16 @@ describe("POST /v1/map", () => {
|
|||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
|
|
||||||
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
|
const containsDocsFirecrawlDev = links.some((link: string) =>
|
||||||
|
link.includes("docs.firecrawl.dev")
|
||||||
|
);
|
||||||
expect(containsDocsFirecrawlDev).toBe(true);
|
expect(containsDocsFirecrawlDev).toBe(true);
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response with a valid API key and search and allowSubdomains and www",
|
||||||
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://www.firecrawl.dev",
|
url: "https://www.firecrawl.dev",
|
||||||
search: "docs",
|
search: "docs",
|
||||||
@@ -614,11 +681,17 @@ describe("POST /v1/map", () => {
|
|||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
|
|
||||||
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
|
const containsDocsFirecrawlDev = links.some((link: string) =>
|
||||||
|
link.includes("docs.firecrawl.dev")
|
||||||
|
);
|
||||||
expect(containsDocsFirecrawlDev).toBe(true);
|
expect(containsDocsFirecrawlDev).toBe(true);
|
||||||
}, 10000)
|
},
|
||||||
|
10000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
|
it.concurrent(
|
||||||
|
"should return a successful response with a valid API key and search and not allowSubdomains and www",
|
||||||
|
async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "https://www.firecrawl.dev",
|
url: "https://www.firecrawl.dev",
|
||||||
search: "docs",
|
search: "docs",
|
||||||
@@ -641,13 +714,14 @@ describe("POST /v1/map", () => {
|
|||||||
expect(Array.isArray(links)).toBe(true);
|
expect(Array.isArray(links)).toBe(true);
|
||||||
expect(links.length).toBeGreaterThan(0);
|
expect(links.length).toBeGreaterThan(0);
|
||||||
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
||||||
})
|
}
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should return an error for invalid URL", async () => {
|
it.concurrent("should return an error for invalid URL", async () => {
|
||||||
const mapRequest = {
|
const mapRequest = {
|
||||||
url: "invalid-url",
|
url: "invalid-url",
|
||||||
includeSubdomains: true,
|
includeSubdomains: true,
|
||||||
search: "test",
|
search: "test"
|
||||||
};
|
};
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
@@ -660,10 +734,9 @@ describe("POST /v1/map", () => {
|
|||||||
expect(response.body).toHaveProperty("success", false);
|
expect(response.body).toHaveProperty("success", false);
|
||||||
expect(response.body).toHaveProperty("error");
|
expect(response.body).toHaveProperty("error");
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("POST /v1/crawl", () => {
|
||||||
describe("POST /v1/crawl", () => {
|
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||||
.post("/v1/crawl")
|
.post("/v1/crawl")
|
||||||
@@ -673,7 +746,7 @@ describe("POST /v1/crawl", () => {
|
|||||||
|
|
||||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||||
const scrapeRequest: ScrapeRequestInput = {
|
const scrapeRequest: ScrapeRequestInput = {
|
||||||
url: "https://facebook.com/fake-test",
|
url: "https://facebook.com/fake-test"
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await request(TEST_URL)
|
const response = await request(TEST_URL)
|
||||||
@@ -683,7 +756,9 @@ describe("POST /v1/crawl", () => {
|
|||||||
.send(scrapeRequest);
|
.send(scrapeRequest);
|
||||||
|
|
||||||
expect(response.statusCode).toBe(403);
|
expect(response.statusCode).toBe(403);
|
||||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
expect(response.body.error).toBe(
|
||||||
|
"URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
@@ -725,7 +800,7 @@ describe("POST /v1/crawl", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
limit: 40,
|
limit: 40,
|
||||||
includePaths: ["blog/*"],
|
includePaths: ["blog/*"]
|
||||||
});
|
});
|
||||||
|
|
||||||
let response;
|
let response;
|
||||||
@@ -781,7 +856,7 @@ describe("POST /v1/crawl", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://firecrawl.dev",
|
url: "https://firecrawl.dev",
|
||||||
limit: 40,
|
limit: 40,
|
||||||
excludePaths: ["blog/*"],
|
excludePaths: ["blog/*"]
|
||||||
});
|
});
|
||||||
|
|
||||||
let isFinished = false;
|
let isFinished = false;
|
||||||
@@ -802,9 +877,7 @@ describe("POST /v1/crawl", () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||||
const completedResponse = await request(
|
const completedResponse = await request(TEST_URL)
|
||||||
TEST_URL
|
|
||||||
)
|
|
||||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
@@ -828,7 +901,7 @@ describe("POST /v1/crawl", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com",
|
url: "https://www.scrapethissite.com",
|
||||||
maxDepth: 1,
|
maxDepth: 1
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -837,7 +910,9 @@ describe("POST /v1/crawl", () => {
|
|||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("status");
|
expect(response.body).toHaveProperty("status");
|
||||||
expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status);
|
expect(["active", "waiting", "completed", "scraping"]).toContain(
|
||||||
|
response.body.status
|
||||||
|
);
|
||||||
// wait for 60 seconds
|
// wait for 60 seconds
|
||||||
let isCompleted = false;
|
let isCompleted = false;
|
||||||
while (!isCompleted) {
|
while (!isCompleted) {
|
||||||
@@ -850,9 +925,7 @@ describe("POST /v1/crawl", () => {
|
|||||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const completedResponse = await request(
|
const completedResponse = await request(TEST_URL)
|
||||||
TEST_URL
|
|
||||||
)
|
|
||||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
@@ -884,9 +957,9 @@ describe("POST /v1/crawl", () => {
|
|||||||
},
|
},
|
||||||
180000
|
180000
|
||||||
);
|
);
|
||||||
})
|
});
|
||||||
|
|
||||||
describe("GET /v1/crawl/:jobId", () => {
|
describe("GET /v1/crawl/:jobId", () => {
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response = await request(TEST_URL).get("/v1/crawl/123");
|
const response = await request(TEST_URL).get("/v1/crawl/123");
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
@@ -950,14 +1023,10 @@ describe("GET /v1/crawl/:jobId", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||||
expect(
|
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||||
completedResponse.body.data[0].metadata.error
|
|
||||||
).toBeUndefined();
|
|
||||||
|
|
||||||
const childrenLinks = completedResponse.body.data.filter(
|
const childrenLinks = completedResponse.body.data.filter(
|
||||||
(doc) =>
|
(doc) => doc.metadata && doc.metadata.sourceURL
|
||||||
doc.metadata &&
|
|
||||||
doc.metadata.sourceURL
|
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||||
@@ -1001,5 +1070,5 @@ describe("GET /v1/crawl/:jobId", () => {
|
|||||||
},
|
},
|
||||||
60000
|
60000
|
||||||
); // 60 seconds
|
); // 60 seconds
|
||||||
})
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import request from "supertest";
|
|||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import {
|
import {
|
||||||
ScrapeRequest,
|
ScrapeRequest,
|
||||||
ScrapeResponseRequestTest,
|
ScrapeResponseRequestTest
|
||||||
} from "../../controllers/v1/types";
|
} from "../../controllers/v1/types";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
@@ -10,31 +10,39 @@ const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
|
|||||||
const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
|
const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
|
||||||
|
|
||||||
describe("E2E Tests for v1 API Routes", () => {
|
describe("E2E Tests for v1 API Routes", () => {
|
||||||
|
it.concurrent(
|
||||||
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
"should return a successful response for a scrape with 403 page",
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
async () => {
|
||||||
.post('/v1/scrape')
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
FIRECRAWL_API_URL
|
||||||
.set('Content-Type', 'application/json')
|
)
|
||||||
.send({ url: 'https://httpstat.us/403' });
|
.post("/v1/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: "https://httpstat.us/403" });
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data).toHaveProperty('markdown');
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty('metadata');
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
expect(response.body.data.metadata.statusCode).toBe(403);
|
expect(response.body.data.metadata.statusCode).toBe(403);
|
||||||
}, 30000);
|
},
|
||||||
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'formats:markdown (default)' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL
|
url: E2E_TEST_SERVER_URL
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -48,26 +56,40 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
|
|
||||||
expect(response.body.data).toHaveProperty("markdown");
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
|
expect(response.body.data.markdown).toContain(
|
||||||
expect(response.body.data.markdown).toContain("Content with id #content-1");
|
"This page is used for end-to-end (e2e) testing with Firecrawl."
|
||||||
|
);
|
||||||
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"Content with id #content-1"
|
||||||
|
);
|
||||||
// expect(response.body.data.markdown).toContain("Loading...");
|
// expect(response.body.data.markdown).toContain("Loading...");
|
||||||
expect(response.body.data.markdown).toContain("Click me!");
|
expect(response.body.data.markdown).toContain("Click me!");
|
||||||
expect(response.body.data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe
|
expect(response.body.data.markdown).toContain(
|
||||||
expect(response.body.data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom
|
"Power your AI apps with clean data crawled from any website. It's also open-source."
|
||||||
|
); // firecrawl.dev inside an iframe
|
||||||
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"This content loads only when you see it. Don't blink! 👼"
|
||||||
|
); // the browser always scroll to the bottom
|
||||||
expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
|
expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default
|
||||||
expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
|
expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default
|
||||||
expect(response.body.data.markdown).not.toContain("This content is only visible on mobile");
|
expect(response.body.data.markdown).not.toContain(
|
||||||
|
"This content is only visible on mobile"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'formats:html' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'formats:html' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
formats: ["html"]
|
formats: ["html"]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -79,23 +101,30 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
expect(response.body.data).not.toHaveProperty("markdown");
|
expect(response.body.data).not.toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty("html");
|
expect(response.body.data).toHaveProperty("html");
|
||||||
|
|
||||||
expect(response.body.data.html).not.toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
|
expect(response.body.data.html).not.toContain(
|
||||||
expect(response.body.data.html).toContain("<p style=\"\">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
|
'<header class="row-start-1" style="">Header</header>'
|
||||||
|
);
|
||||||
|
expect(response.body.data.html).toContain(
|
||||||
|
'<p style="">This page is used for end-to-end (e2e) testing with Firecrawl.</p>'
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'rawHtml' in 'formats' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
formats: ["rawHtml"]
|
formats: ["rawHtml"]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -110,23 +139,30 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
expect(response.body.data).not.toHaveProperty("markdown");
|
expect(response.body.data).not.toHaveProperty("markdown");
|
||||||
expect(response.body.data).toHaveProperty("rawHtml");
|
expect(response.body.data).toHaveProperty("rawHtml");
|
||||||
|
|
||||||
expect(response.body.data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
|
expect(response.body.data.rawHtml).toContain(
|
||||||
|
">This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
|
||||||
|
);
|
||||||
expect(response.body.data.rawHtml).toContain(">Header</header>");
|
expect(response.body.data.rawHtml).toContain(">Header</header>");
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
// - TODO: tests for links
|
// - TODO: tests for links
|
||||||
// - TODO: tests for screenshot
|
// - TODO: tests for screenshot
|
||||||
// - TODO: tests for screenshot@fullPage
|
// - TODO: tests for screenshot@fullPage
|
||||||
|
|
||||||
it.concurrent("should handle 'headers' parameter correctly", async () => {
|
it.concurrent(
|
||||||
|
"should handle 'headers' parameter correctly",
|
||||||
|
async () => {
|
||||||
// @ts-ignore
|
// @ts-ignore
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
headers: { "e2e-header-test": "firecrawl" }
|
headers: { "e2e-header-test": "firecrawl" }
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -138,17 +174,24 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl");
|
expect(response.body.data.markdown).toContain(
|
||||||
}, 30000);
|
"e2e-header-test: firecrawl"
|
||||||
|
);
|
||||||
|
},
|
||||||
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'includeTags' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'includeTags' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
includeTags: ['#content-1']
|
includeTags: ["#content-1"]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -160,19 +203,27 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).not.toContain("<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
|
expect(response.body.data.markdown).not.toContain(
|
||||||
expect(response.body.data.markdown).toContain("Content with id #content-1");
|
"<p>This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
|
||||||
|
);
|
||||||
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"Content with id #content-1"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'excludeTags' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'excludeTags' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
excludeTags: ['#content-1']
|
excludeTags: ["#content-1"]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -184,12 +235,18 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
|
expect(response.body.data.markdown).toContain(
|
||||||
expect(response.body.data.markdown).not.toContain("Content with id #content-1");
|
"This page is used for end-to-end (e2e) testing with Firecrawl."
|
||||||
|
);
|
||||||
|
expect(response.body.data.markdown).not.toContain(
|
||||||
|
"Content with id #content-1"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'onlyMainContent' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'onlyMainContent' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
@@ -197,7 +254,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
onlyMainContent: false
|
onlyMainContent: false
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -209,19 +268,27 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
|
expect(response.body.data.markdown).toContain(
|
||||||
expect(response.body.data.html).toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
|
"This page is used for end-to-end (e2e) testing with Firecrawl."
|
||||||
|
);
|
||||||
|
expect(response.body.data.html).toContain(
|
||||||
|
'<header class="row-start-1" style="">Header</header>'
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'timeout' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'timeout' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
timeout: 500
|
timeout: 500
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -234,17 +301,21 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
}
|
}
|
||||||
expect(response.body.error).toBe("Request timed out");
|
expect(response.body.error).toBe("Request timed out");
|
||||||
expect(response.body.success).toBe(false);
|
expect(response.body.success).toBe(false);
|
||||||
}, 30000);
|
},
|
||||||
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
|
it.concurrent(
|
||||||
it.concurrent("should handle 'mobile' parameter correctly",
|
"should handle 'mobile' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
mobile: true
|
mobile: true
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -255,43 +326,61 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data.markdown).toContain("This content is only visible on mobile");
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"This content is only visible on mobile"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'parsePDF' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'parsePDF' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'});
|
.send({ url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" });
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty('data');
|
expect(response.body).toHaveProperty("data");
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
|
expect(response.body.data.markdown).toContain(
|
||||||
expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
|
"arXiv:astro-ph/9301001v1 7 Jan 1993"
|
||||||
|
);
|
||||||
|
expect(response.body.data.markdown).not.toContain(
|
||||||
|
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm"
|
||||||
|
);
|
||||||
|
|
||||||
const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const responseNoParsePDF: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false });
|
.send({
|
||||||
|
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||||
|
parsePDF: false
|
||||||
|
});
|
||||||
await new Promise((r) => setTimeout(r, 6000));
|
await new Promise((r) => setTimeout(r, 6000));
|
||||||
|
|
||||||
expect(responseNoParsePDF.statusCode).toBe(200);
|
expect(responseNoParsePDF.statusCode).toBe(200);
|
||||||
expect(responseNoParsePDF.body).toHaveProperty('data');
|
expect(responseNoParsePDF.body).toHaveProperty("data");
|
||||||
if (!("data" in responseNoParsePDF.body)) {
|
if (!("data" in responseNoParsePDF.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
|
expect(responseNoParsePDF.body.data.markdown).toContain(
|
||||||
|
"h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
// it.concurrent("should handle 'location' parameter correctly",
|
// it.concurrent("should handle 'location' parameter correctly",
|
||||||
// async () => {
|
// async () => {
|
||||||
@@ -314,59 +403,68 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// },
|
// },
|
||||||
// 30000);
|
// 30000);
|
||||||
|
|
||||||
it.concurrent("should handle 'skipTlsVerification' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'skipTlsVerification' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: "https://expired.badssl.com/",
|
url: "https://expired.badssl.com/",
|
||||||
timeout: 120000
|
timeout: 120000
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send(scrapeRequest);
|
.send(scrapeRequest);
|
||||||
console.log("Error1a")
|
console.log("Error1a");
|
||||||
// console.log(response.body)
|
// console.log(response.body)
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
if (!("data" in response.body)) {
|
if (!("data" in response.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
expect(response.body.data.metadata.pageStatusCode).toBe(500);
|
||||||
console.log("Error?")
|
console.log("Error?");
|
||||||
|
|
||||||
const scrapeRequestWithSkipTlsVerification = {
|
const scrapeRequestWithSkipTlsVerification = {
|
||||||
url: "https://expired.badssl.com/",
|
url: "https://expired.badssl.com/",
|
||||||
skipTlsVerification: true,
|
skipTlsVerification: true,
|
||||||
timeout: 120000
|
timeout: 120000
|
||||||
|
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const responseWithSkipTlsVerification: ScrapeResponseRequestTest =
|
||||||
|
await request(FIRECRAWL_API_URL)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send(scrapeRequestWithSkipTlsVerification);
|
.send(scrapeRequestWithSkipTlsVerification);
|
||||||
|
|
||||||
console.log("Error1b")
|
console.log("Error1b");
|
||||||
// console.log(responseWithSkipTlsVerification.body)
|
// console.log(responseWithSkipTlsVerification.body)
|
||||||
expect(responseWithSkipTlsVerification.statusCode).toBe(200);
|
expect(responseWithSkipTlsVerification.statusCode).toBe(200);
|
||||||
if (!("data" in responseWithSkipTlsVerification.body)) {
|
if (!("data" in responseWithSkipTlsVerification.body)) {
|
||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
// console.log(responseWithSkipTlsVerification.body.data)
|
// console.log(responseWithSkipTlsVerification.body.data)
|
||||||
expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
|
expect(responseWithSkipTlsVerification.body.data.markdown).toContain(
|
||||||
|
"badssl.com"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
60000);
|
60000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'removeBase64Images' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'removeBase64Images' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
removeBase64Images: true
|
removeBase64Images: true
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -380,19 +478,25 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// - TODO: not working for every image
|
// - TODO: not working for every image
|
||||||
// expect(response.body.data.markdown).toContain("Image-Removed");
|
// expect(response.body.data.markdown).toContain("Image-Removed");
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'action wait' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action wait' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
actions: [{
|
actions: [
|
||||||
|
{
|
||||||
type: "wait",
|
type: "wait",
|
||||||
milliseconds: 10000
|
milliseconds: 10000
|
||||||
}]
|
}
|
||||||
|
]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -403,21 +507,29 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data.markdown).not.toContain("Loading...");
|
expect(response.body.data.markdown).not.toContain("Loading...");
|
||||||
expect(response.body.data.markdown).toContain("Content loaded after 5 seconds!");
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"Content loaded after 5 seconds!"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
// screenshot
|
// screenshot
|
||||||
it.concurrent("should handle 'action screenshot' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action screenshot' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
actions: [{
|
actions: [
|
||||||
|
{
|
||||||
type: "screenshot"
|
type: "screenshot"
|
||||||
}]
|
}
|
||||||
|
]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -430,27 +542,37 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
if (!response.body.data.actions?.screenshots) {
|
if (!response.body.data.actions?.screenshots) {
|
||||||
throw new Error("Expected response body to have screenshots array");
|
throw new Error("Expected response body to have screenshots array");
|
||||||
}
|
}
|
||||||
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0);
|
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
|
||||||
expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
|
0
|
||||||
|
);
|
||||||
|
expect(response.body.data.actions.screenshots[0]).toContain(
|
||||||
|
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"
|
||||||
|
);
|
||||||
|
|
||||||
// TODO compare screenshot with expected screenshot
|
// TODO compare screenshot with expected screenshot
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'action screenshot@fullPage' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action screenshot@fullPage' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
actions: [{
|
actions: [
|
||||||
|
{
|
||||||
type: "screenshot",
|
type: "screenshot",
|
||||||
fullPage: true
|
fullPage: true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type:"scrape"
|
type: "scrape"
|
||||||
}]
|
}
|
||||||
|
]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -464,29 +586,43 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
if (!response.body.data.actions?.screenshots) {
|
if (!response.body.data.actions?.screenshots) {
|
||||||
throw new Error("Expected response body to have screenshots array");
|
throw new Error("Expected response body to have screenshots array");
|
||||||
}
|
}
|
||||||
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0);
|
expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(
|
||||||
expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
|
0
|
||||||
|
);
|
||||||
|
expect(response.body.data.actions.screenshots[0]).toContain(
|
||||||
|
"https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"
|
||||||
|
);
|
||||||
|
|
||||||
if (!response.body.data.actions?.scrapes) {
|
if (!response.body.data.actions?.scrapes) {
|
||||||
throw new Error("Expected response body to have scrapes array");
|
throw new Error("Expected response body to have scrapes array");
|
||||||
}
|
}
|
||||||
expect(response.body.data.actions.scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/");
|
expect(response.body.data.actions.scrapes[0].url).toBe(
|
||||||
expect(response.body.data.actions.scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
|
"https://firecrawl-e2e-test.vercel.app/"
|
||||||
|
);
|
||||||
|
expect(response.body.data.actions.scrapes[0].html).toContain(
|
||||||
|
"This page is used for end-to-end (e2e) testing with Firecrawl.</p>"
|
||||||
|
);
|
||||||
// TODO compare screenshot with expected full page screenshot
|
// TODO compare screenshot with expected full page screenshot
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'action click' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action click' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
actions: [{
|
actions: [
|
||||||
|
{
|
||||||
type: "click",
|
type: "click",
|
||||||
selector: "#click-me"
|
selector: "#click-me"
|
||||||
}]
|
}
|
||||||
|
]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -497,16 +633,21 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
throw new Error("Expected response body to have 'data' property");
|
throw new Error("Expected response body to have 'data' property");
|
||||||
}
|
}
|
||||||
expect(response.body.data.markdown).not.toContain("Click me!");
|
expect(response.body.data.markdown).not.toContain("Click me!");
|
||||||
expect(response.body.data.markdown).toContain("Text changed after click!");
|
expect(response.body.data.markdown).toContain(
|
||||||
|
"Text changed after click!"
|
||||||
|
);
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
it.concurrent("should handle 'action write' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action write' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
formats: ["html"],
|
formats: ["html"],
|
||||||
actions: [{
|
actions: [
|
||||||
|
{
|
||||||
type: "click",
|
type: "click",
|
||||||
selector: "#input-1"
|
selector: "#input-1"
|
||||||
},
|
},
|
||||||
@@ -514,9 +655,12 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
type: "write",
|
type: "write",
|
||||||
text: "Hello, world!"
|
text: "Hello, world!"
|
||||||
}
|
}
|
||||||
]} as ScrapeRequest;
|
]
|
||||||
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -531,10 +675,12 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// uncomment the following line:
|
// uncomment the following line:
|
||||||
// expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
|
// expect(response.body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: fix this test (need to fix fire-engine first)
|
// TODO: fix this test (need to fix fire-engine first)
|
||||||
it.concurrent("should handle 'action pressKey' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action pressKey' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
@@ -547,7 +693,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
]
|
]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -561,10 +709,12 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
// }
|
// }
|
||||||
// expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
|
// expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: fix this test (need to fix fire-engine first)
|
// TODO: fix this test (need to fix fire-engine first)
|
||||||
it.concurrent("should handle 'action scroll' parameter correctly",
|
it.concurrent(
|
||||||
|
"should handle 'action scroll' parameter correctly",
|
||||||
async () => {
|
async () => {
|
||||||
const scrapeRequest = {
|
const scrapeRequest = {
|
||||||
url: E2E_TEST_SERVER_URL,
|
url: E2E_TEST_SERVER_URL,
|
||||||
@@ -582,7 +732,9 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
]
|
]
|
||||||
} as ScrapeRequest;
|
} as ScrapeRequest;
|
||||||
|
|
||||||
const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
|
const response: ScrapeResponseRequestTest = await request(
|
||||||
|
FIRECRAWL_API_URL
|
||||||
|
)
|
||||||
.post("/v1/scrape")
|
.post("/v1/scrape")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
@@ -596,8 +748,8 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
//
|
//
|
||||||
// expect(response.body.data.markdown).toContain("You have reached the bottom!")
|
// expect(response.body.data.markdown).toContain("You have reached the bottom!")
|
||||||
},
|
},
|
||||||
30000);
|
30000
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: test scrape action
|
// TODO: test scrape action
|
||||||
|
|
||||||
});
|
});
|
||||||
@@ -3,7 +3,7 @@ import dotenv from "dotenv";
|
|||||||
import {
|
import {
|
||||||
FirecrawlCrawlResponse,
|
FirecrawlCrawlResponse,
|
||||||
FirecrawlCrawlStatusResponse,
|
FirecrawlCrawlStatusResponse,
|
||||||
FirecrawlScrapeResponse,
|
FirecrawlScrapeResponse
|
||||||
} from "../../types";
|
} from "../../types";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
@@ -28,9 +28,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
|
|
||||||
describe("POST /v0/scrape", () => {
|
describe("POST /v0/scrape", () => {
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response: FirecrawlScrapeResponse = await request(TEST_URL).post(
|
const response: FirecrawlScrapeResponse =
|
||||||
"/v0/scrape"
|
await request(TEST_URL).post("/v0/scrape");
|
||||||
);
|
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -99,7 +98,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://roastmywebsite.ai",
|
url: "https://roastmywebsite.ai",
|
||||||
pageOptions: { includeHtml: true },
|
pageOptions: { includeHtml: true }
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -196,7 +195,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com/",
|
url: "https://www.scrapethissite.com/",
|
||||||
pageOptions: { removeTags: [".nav", "#footer", "strong"] },
|
pageOptions: { removeTags: [".nav", "#footer", "strong"] }
|
||||||
});
|
});
|
||||||
expect(response.statusCode).toBe(200);
|
expect(response.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("data");
|
expect(response.body).toHaveProperty("data");
|
||||||
@@ -338,9 +337,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
|
|
||||||
describe("POST /v0/crawl", () => {
|
describe("POST /v0/crawl", () => {
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response: FirecrawlCrawlResponse = await request(TEST_URL).post(
|
const response: FirecrawlCrawlResponse =
|
||||||
"/v0/crawl"
|
await request(TEST_URL).post("/v0/crawl");
|
||||||
);
|
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -383,8 +381,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
limit: 10,
|
limit: 10,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
includes: ["blog/*"],
|
includes: ["blog/*"]
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let response: FirecrawlCrawlStatusResponse;
|
let response: FirecrawlCrawlStatusResponse;
|
||||||
@@ -446,8 +444,8 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
limit: 10,
|
limit: 10,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
excludes: ["blog/*"],
|
excludes: ["blog/*"]
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let isFinished = false;
|
let isFinished = false;
|
||||||
@@ -494,7 +492,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({
|
.send({
|
||||||
url: "https://www.scrapethissite.com",
|
url: "https://www.scrapethissite.com",
|
||||||
crawlerOptions: { maxDepth: 1 },
|
crawlerOptions: { maxDepth: 1 }
|
||||||
});
|
});
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -690,7 +688,9 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
|
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
|
||||||
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
|
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
|
||||||
|
200
|
||||||
|
);
|
||||||
expect(
|
expect(
|
||||||
completedResponse.body.data[0].metadata.pageError
|
completedResponse.body.data[0].metadata.pageError
|
||||||
).toBeUndefined();
|
).toBeUndefined();
|
||||||
@@ -760,7 +760,10 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.post("/v0/crawl")
|
.post("/v0/crawl")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://docs.tatum.io", crawlerOptions: { limit: 200 } });
|
.send({
|
||||||
|
url: "https://docs.tatum.io",
|
||||||
|
crawlerOptions: { limit: 200 }
|
||||||
|
});
|
||||||
|
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
@@ -825,7 +828,7 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
.send({
|
.send({
|
||||||
url: "https://mendable.ai",
|
url: "https://mendable.ai",
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
onlyMainContent: true,
|
onlyMainContent: true
|
||||||
},
|
},
|
||||||
extractorOptions: {
|
extractorOptions: {
|
||||||
mode: "llm-extraction",
|
mode: "llm-extraction",
|
||||||
@@ -835,18 +838,18 @@ describe("E2E Tests for v0 API Routes", () => {
|
|||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
company_mission: {
|
company_mission: {
|
||||||
type: "string",
|
type: "string"
|
||||||
},
|
},
|
||||||
supports_sso: {
|
supports_sso: {
|
||||||
type: "boolean",
|
type: "boolean"
|
||||||
},
|
},
|
||||||
is_open_source: {
|
is_open_source: {
|
||||||
type: "boolean",
|
type: "boolean"
|
||||||
},
|
}
|
||||||
},
|
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"],
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
|
required: ["company_mission", "supports_sso", "is_open_source"]
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Ensure that the job was successfully created before proceeding with LLM extraction
|
// Ensure that the job was successfully created before proceeding with LLM extraction
|
||||||
|
|||||||
@@ -1,30 +1,30 @@
|
|||||||
import { crawlController } from '../v0/crawl'
|
import { crawlController } from "../v0/crawl";
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from "express";
|
||||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
import { authenticateUser } from "../auth"; // Ensure this import is correct
|
||||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
import { createIdempotencyKey } from "../../services/idempotency/create";
|
||||||
import { validateIdempotencyKey } from '../../services/idempotency/validate';
|
import { validateIdempotencyKey } from "../../services/idempotency/validate";
|
||||||
import { v4 as uuidv4 } from 'uuid';
|
import { v4 as uuidv4 } from "uuid";
|
||||||
|
|
||||||
jest.mock('../auth', () => ({
|
jest.mock("../auth", () => ({
|
||||||
authenticateUser: jest.fn().mockResolvedValue({
|
authenticateUser: jest.fn().mockResolvedValue({
|
||||||
success: true,
|
success: true,
|
||||||
team_id: 'team123',
|
team_id: "team123",
|
||||||
error: null,
|
error: null,
|
||||||
status: 200
|
status: 200
|
||||||
}),
|
}),
|
||||||
reduce: jest.fn()
|
reduce: jest.fn()
|
||||||
}));
|
}));
|
||||||
jest.mock('../../services/idempotency/validate');
|
jest.mock("../../services/idempotency/validate");
|
||||||
|
|
||||||
describe('crawlController', () => {
|
describe("crawlController", () => {
|
||||||
it('should prevent duplicate requests using the same idempotency key', async () => {
|
it("should prevent duplicate requests using the same idempotency key", async () => {
|
||||||
const req = {
|
const req = {
|
||||||
headers: {
|
headers: {
|
||||||
'x-idempotency-key': await uuidv4(),
|
"x-idempotency-key": await uuidv4(),
|
||||||
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
|
Authorization: `Bearer ${process.env.TEST_API_KEY}`
|
||||||
},
|
},
|
||||||
body: {
|
body: {
|
||||||
url: 'https://mendable.ai'
|
url: "https://mendable.ai"
|
||||||
}
|
}
|
||||||
} as unknown as Request;
|
} as unknown as Request;
|
||||||
const res = {
|
const res = {
|
||||||
@@ -33,7 +33,9 @@ describe('crawlController', () => {
|
|||||||
} as unknown as Response;
|
} as unknown as Response;
|
||||||
|
|
||||||
// Mock the idempotency key validation to return false for the second call
|
// Mock the idempotency key validation to return false for the second call
|
||||||
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
|
(validateIdempotencyKey as jest.Mock)
|
||||||
|
.mockResolvedValueOnce(true)
|
||||||
|
.mockResolvedValueOnce(false);
|
||||||
|
|
||||||
// First request should succeed
|
// First request should succeed
|
||||||
await crawlController(req, res);
|
await crawlController(req, res);
|
||||||
@@ -42,6 +44,8 @@ describe('crawlController', () => {
|
|||||||
// Second request with the same key should fail
|
// Second request with the same key should fail
|
||||||
await crawlController(req, res);
|
await crawlController(req, res);
|
||||||
expect(res.status).toHaveBeenCalledWith(409);
|
expect(res.status).toHaveBeenCalledWith(409);
|
||||||
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
|
expect(res.json).toHaveBeenCalledWith({
|
||||||
|
error: "Idempotency key already used"
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -4,7 +4,7 @@ import {
|
|||||||
AuthResponse,
|
AuthResponse,
|
||||||
NotificationType,
|
NotificationType,
|
||||||
PlanType,
|
PlanType,
|
||||||
RateLimiterMode,
|
RateLimiterMode
|
||||||
} from "../types";
|
} from "../types";
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { withAuth } from "../lib/withAuth";
|
import { withAuth } from "../lib/withAuth";
|
||||||
@@ -39,7 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
|
|||||||
export async function setCachedACUC(
|
export async function setCachedACUC(
|
||||||
api_key: string,
|
api_key: string,
|
||||||
acuc:
|
acuc:
|
||||||
| AuthCreditUsageChunk | null
|
| AuthCreditUsageChunk
|
||||||
|
| null
|
||||||
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
|
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
|
||||||
) {
|
) {
|
||||||
const cacheKeyACUC = `acuc_${api_key}`;
|
const cacheKeyACUC = `acuc_${api_key}`;
|
||||||
@@ -48,7 +49,7 @@ export async function setCachedACUC(
|
|||||||
try {
|
try {
|
||||||
await redlock.using([redLockKey], 10000, {}, async (signal) => {
|
await redlock.using([redLockKey], 10000, {}, async (signal) => {
|
||||||
if (typeof acuc === "function") {
|
if (typeof acuc === "function") {
|
||||||
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));
|
acuc = acuc(JSON.parse((await getValue(cacheKeyACUC)) ?? "null"));
|
||||||
|
|
||||||
if (acuc === null) {
|
if (acuc === null) {
|
||||||
if (signal.aborted) {
|
if (signal.aborted) {
|
||||||
@@ -134,9 +135,7 @@ export async function getACUC(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function clearACUC(
|
export async function clearACUC(api_key: string): Promise<void> {
|
||||||
api_key: string,
|
|
||||||
): Promise<void> {
|
|
||||||
const cacheKeyACUC = `acuc_${api_key}`;
|
const cacheKeyACUC = `acuc_${api_key}`;
|
||||||
await deleteKey(cacheKeyACUC);
|
await deleteKey(cacheKeyACUC);
|
||||||
}
|
}
|
||||||
@@ -146,7 +145,11 @@ export async function authenticateUser(
|
|||||||
res,
|
res,
|
||||||
mode?: RateLimiterMode
|
mode?: RateLimiterMode
|
||||||
): Promise<AuthResponse> {
|
): Promise<AuthResponse> {
|
||||||
return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
|
return withAuth(supaAuthenticateUser, {
|
||||||
|
success: true,
|
||||||
|
chunk: null,
|
||||||
|
team_id: "bypass"
|
||||||
|
})(req, res, mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function supaAuthenticateUser(
|
export async function supaAuthenticateUser(
|
||||||
@@ -167,7 +170,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Unauthorized: Token missing",
|
error: "Unauthorized: Token missing",
|
||||||
status: 401,
|
status: 401
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -196,7 +199,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Unauthorized: Invalid token",
|
error: "Unauthorized: Invalid token",
|
||||||
status: 401,
|
status: 401
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -206,7 +209,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Unauthorized: Invalid token",
|
error: "Unauthorized: Invalid token",
|
||||||
status: 401,
|
status: 401
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -216,7 +219,7 @@ export async function supaAuthenticateUser(
|
|||||||
const plan = getPlanByPriceId(priceId);
|
const plan = getPlanByPriceId(priceId);
|
||||||
subscriptionData = {
|
subscriptionData = {
|
||||||
team_id: teamId,
|
team_id: teamId,
|
||||||
plan,
|
plan
|
||||||
};
|
};
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case RateLimiterMode.Crawl:
|
case RateLimiterMode.Crawl:
|
||||||
@@ -270,7 +273,13 @@ export async function supaAuthenticateUser(
|
|||||||
try {
|
try {
|
||||||
await rateLimiter.consume(team_endpoint_token);
|
await rateLimiter.consume(team_endpoint_token);
|
||||||
} catch (rateLimiterRes) {
|
} catch (rateLimiterRes) {
|
||||||
logger.error(`Rate limit exceeded: ${rateLimiterRes}`, { teamId, priceId, plan: subscriptionData?.plan, mode, rateLimiterRes });
|
logger.error(`Rate limit exceeded: ${rateLimiterRes}`, {
|
||||||
|
teamId,
|
||||||
|
priceId,
|
||||||
|
plan: subscriptionData?.plan,
|
||||||
|
mode,
|
||||||
|
rateLimiterRes
|
||||||
|
});
|
||||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||||
|
|
||||||
@@ -284,7 +293,7 @@ export async function supaAuthenticateUser(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
error: `Rate limit exceeded. Consumed (req/min): ${rateLimiterRes.consumedPoints}, Remaining (req/min): ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||||
status: 429,
|
status: 429
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -314,7 +323,7 @@ export async function supaAuthenticateUser(
|
|||||||
success: true,
|
success: true,
|
||||||
team_id: teamId ?? undefined,
|
team_id: teamId ?? undefined,
|
||||||
plan: (subscriptionData?.plan ?? "") as PlanType,
|
plan: (subscriptionData?.plan ?? "") as PlanType,
|
||||||
chunk,
|
chunk
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
function getPlanByPriceId(price_id: string | null): PlanType {
|
function getPlanByPriceId(price_id: string | null): PlanType {
|
||||||
|
|||||||
@@ -31,7 +31,9 @@ export async function cleanBefore24hCompleteJobsController(
|
|||||||
).flat();
|
).flat();
|
||||||
const before24hJobs =
|
const before24hJobs =
|
||||||
completedJobs.filter(
|
completedJobs.filter(
|
||||||
(job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
(job) =>
|
||||||
|
job.finishedOn !== undefined &&
|
||||||
|
job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
let count = 0;
|
let count = 0;
|
||||||
@@ -71,14 +73,14 @@ export async function queuesController(req: Request, res: Response) {
|
|||||||
const scrapeQueue = getScrapeQueue();
|
const scrapeQueue = getScrapeQueue();
|
||||||
|
|
||||||
const [webScraperActive] = await Promise.all([
|
const [webScraperActive] = await Promise.all([
|
||||||
scrapeQueue.getActiveCount(),
|
scrapeQueue.getActiveCount()
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const noActiveJobs = webScraperActive === 0;
|
const noActiveJobs = webScraperActive === 0;
|
||||||
// 200 if no active jobs, 503 if there are active jobs
|
// 200 if no active jobs, 503 if there are active jobs
|
||||||
return res.status(noActiveJobs ? 200 : 500).json({
|
return res.status(noActiveJobs ? 200 : 500).json({
|
||||||
webScraperActive,
|
webScraperActive,
|
||||||
noActiveJobs,
|
noActiveJobs
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
@@ -97,7 +99,7 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
await Promise.all([
|
await Promise.all([
|
||||||
scrapeQueue.getActiveCount(),
|
scrapeQueue.getActiveCount(),
|
||||||
scrapeQueue.getWaitingCount(),
|
scrapeQueue.getWaitingCount(),
|
||||||
scrapeQueue.getPrioritizedCount(),
|
scrapeQueue.getPrioritizedCount()
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
||||||
@@ -107,8 +109,8 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
|
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
const machines = await request.json();
|
const machines = await request.json();
|
||||||
@@ -184,13 +186,13 @@ export async function autoscalerController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
mode: "scale-descale",
|
mode: "scale-descale",
|
||||||
count: targetMachineCount,
|
count: targetMachineCount
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
mode: "normal",
|
mode: "normal",
|
||||||
count: activeMachines,
|
count: activeMachines
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ export async function redisHealthController(req: Request, res: Response) {
|
|||||||
const healthStatus = {
|
const healthStatus = {
|
||||||
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
|
queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy",
|
||||||
redisRateLimitClient:
|
redisRateLimitClient:
|
||||||
redisRateLimitHealth === testValue ? "healthy" : "unhealthy",
|
redisRateLimitHealth === testValue ? "healthy" : "unhealthy"
|
||||||
};
|
};
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
|||||||
@@ -10,13 +10,9 @@ configDotenv();
|
|||||||
|
|
||||||
export async function crawlCancelController(req: Request, res: Response) {
|
export async function crawlCancelController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
|
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, RateLimiterMode.CrawlStatus);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.CrawlStatus
|
|
||||||
);
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return res.status(auth.status).json({ error: auth.error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,21 +12,25 @@ import { toLegacyDocument } from "../v1/types";
|
|||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function getJobs(crawlId: string, ids: string[]) {
|
export async function getJobs(crawlId: string, ids: string[]) {
|
||||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
|
const jobs = (
|
||||||
|
await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
|
||||||
|
).filter((x) => x) as Job[];
|
||||||
|
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||||
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
|
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
|
||||||
|
|
||||||
supabaseData.forEach(x => {
|
supabaseData.forEach((x) => {
|
||||||
const job = jobs.find(y => y.id === x.job_id);
|
const job = jobs.find((y) => y.id === x.job_id);
|
||||||
if (job) {
|
if (job) {
|
||||||
job.returnvalue = x.docs;
|
job.returnvalue = x.docs;
|
||||||
}
|
}
|
||||||
})
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
jobs.forEach(job => {
|
jobs.forEach((job) => {
|
||||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
job.returnvalue = Array.isArray(job.returnvalue)
|
||||||
|
? job.returnvalue[0]
|
||||||
|
: job.returnvalue;
|
||||||
});
|
});
|
||||||
|
|
||||||
return jobs;
|
return jobs;
|
||||||
@@ -34,11 +38,7 @@ export async function getJobs(crawlId: string, ids: string[]) {
|
|||||||
|
|
||||||
export async function crawlStatusController(req: Request, res: Response) {
|
export async function crawlStatusController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, RateLimiterMode.CrawlStatus);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.CrawlStatus
|
|
||||||
);
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return res.status(auth.status).json({ error: auth.error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
@@ -55,7 +55,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||||
let jobs = await getJobs(req.params.jobId, jobIDs);
|
let jobs = await getJobs(req.params.jobId, jobIDs);
|
||||||
let jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
let jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
|
||||||
|
|
||||||
// Combine jobs and jobStatuses into a single array of objects
|
// Combine jobs and jobStatuses into a single array of objects
|
||||||
let jobsWithStatuses = jobs.map((job, index) => ({
|
let jobsWithStatuses = jobs.map((job, index) => ({
|
||||||
@@ -64,18 +64,31 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
}));
|
}));
|
||||||
|
|
||||||
// Filter out failed jobs
|
// Filter out failed jobs
|
||||||
jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed" && x.status !== "unknown");
|
jobsWithStatuses = jobsWithStatuses.filter(
|
||||||
|
(x) => x.status !== "failed" && x.status !== "unknown"
|
||||||
|
);
|
||||||
|
|
||||||
// Sort jobs by timestamp
|
// Sort jobs by timestamp
|
||||||
jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp);
|
jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp);
|
||||||
|
|
||||||
// Extract sorted jobs and statuses
|
// Extract sorted jobs and statuses
|
||||||
jobs = jobsWithStatuses.map(x => x.job);
|
jobs = jobsWithStatuses.map((x) => x.job);
|
||||||
jobStatuses = jobsWithStatuses.map(x => x.status);
|
jobStatuses = jobsWithStatuses.map((x) => x.status);
|
||||||
|
|
||||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : "active";
|
const jobStatus = sc.cancelled
|
||||||
|
? "failed"
|
||||||
|
: jobStatuses.every((x) => x === "completed")
|
||||||
|
? "completed"
|
||||||
|
: "active";
|
||||||
|
|
||||||
const data = jobs.filter(x => x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null).map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
const data = jobs
|
||||||
|
.filter(
|
||||||
|
(x) =>
|
||||||
|
x.failedReason !== "Concurreny limit hit" && x.returnvalue !== null
|
||||||
|
)
|
||||||
|
.map((x) =>
|
||||||
|
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
|
||||||
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
jobs.length > 0 &&
|
jobs.length > 0 &&
|
||||||
@@ -83,7 +96,7 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
jobs[0].data.pageOptions &&
|
jobs[0].data.pageOptions &&
|
||||||
!jobs[0].data.pageOptions.includeRawHtml
|
!jobs[0].data.pageOptions.includeRawHtml
|
||||||
) {
|
) {
|
||||||
data.forEach(item => {
|
data.forEach((item) => {
|
||||||
if (item) {
|
if (item) {
|
||||||
delete item.rawHtml;
|
delete item.rawHtml;
|
||||||
}
|
}
|
||||||
@@ -92,10 +105,19 @@ export async function crawlStatusController(req: Request, res: Response) {
|
|||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
status: jobStatus,
|
status: jobStatus,
|
||||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
current: jobStatuses.filter((x) => x === "completed" || x === "failed")
|
||||||
|
.length,
|
||||||
total: jobs.length,
|
total: jobs.length,
|
||||||
data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
|
data:
|
||||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
|
jobStatus === "completed"
|
||||||
|
? data.map((x) => toLegacyDocument(x, sc.internalOptions))
|
||||||
|
: null,
|
||||||
|
partial_data:
|
||||||
|
jobStatus === "completed"
|
||||||
|
? []
|
||||||
|
: data
|
||||||
|
.filter((x) => x !== null)
|
||||||
|
.map((x) => toLegacyDocument(x, sc.internalOptions))
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
|
|||||||
@@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
|||||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
import {
|
||||||
|
defaultCrawlPageOptions,
|
||||||
|
defaultCrawlerOptions,
|
||||||
|
defaultOrigin
|
||||||
|
} from "../../../src/lib/default-values";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
import {
|
||||||
|
addCrawlJob,
|
||||||
|
addCrawlJobs,
|
||||||
|
crawlToCrawler,
|
||||||
|
lockURL,
|
||||||
|
lockURLs,
|
||||||
|
saveCrawl,
|
||||||
|
StoredCrawl
|
||||||
|
} from "../../../src/lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
@@ -20,11 +32,7 @@ import { ZodError } from "zod";
|
|||||||
|
|
||||||
export async function crawlController(req: Request, res: Response) {
|
export async function crawlController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, RateLimiterMode.Crawl);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.Crawl
|
|
||||||
);
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return res.status(auth.status).json({ error: auth.error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
@@ -46,7 +54,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
|
|
||||||
const crawlerOptions = {
|
const crawlerOptions = {
|
||||||
...defaultCrawlerOptions,
|
...defaultCrawlerOptions,
|
||||||
...req.body.crawlerOptions,
|
...req.body.crawlerOptions
|
||||||
};
|
};
|
||||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||||
|
|
||||||
@@ -71,11 +79,19 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
|
const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
|
||||||
const { success: creditsCheckSuccess, message: creditsCheckMessage, remainingCredits } =
|
const {
|
||||||
await checkTeamCredits(chunk, team_id, limitCheck);
|
success: creditsCheckSuccess,
|
||||||
|
message: creditsCheckMessage,
|
||||||
|
remainingCredits
|
||||||
|
} = await checkTeamCredits(chunk, team_id, limitCheck);
|
||||||
|
|
||||||
if (!creditsCheckSuccess) {
|
if (!creditsCheckSuccess) {
|
||||||
return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
|
return res
|
||||||
|
.status(402)
|
||||||
|
.json({
|
||||||
|
error:
|
||||||
|
"Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com"
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: need to do this to v1
|
// TODO: need to do this to v1
|
||||||
@@ -99,7 +115,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
if (isUrlBlocked(url)) {
|
if (isUrlBlocked(url)) {
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
error:
|
error:
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -136,7 +152,11 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
|
|
||||||
await logCrawl(id, team_id);
|
await logCrawl(id, team_id);
|
||||||
|
|
||||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
|
||||||
|
pageOptions,
|
||||||
|
undefined,
|
||||||
|
undefined
|
||||||
|
);
|
||||||
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
internalOptions.disableSmartWaitCache = true; // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||||
|
|
||||||
delete (scrapeOptions as any).timeout;
|
delete (scrapeOptions as any).timeout;
|
||||||
@@ -148,7 +168,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now()
|
||||||
};
|
};
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
@@ -163,14 +183,13 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
? null
|
? null
|
||||||
: await crawler.tryGetSitemap();
|
: await crawler.tryGetSitemap();
|
||||||
|
|
||||||
|
|
||||||
if (sitemap !== null && sitemap.length > 0) {
|
if (sitemap !== null && sitemap.length > 0) {
|
||||||
let jobPriority = 20;
|
let jobPriority = 20;
|
||||||
// If it is over 1000, we need to get the job priority,
|
// If it is over 1000, we need to get the job priority,
|
||||||
// otherwise we can use the default priority of 20
|
// otherwise we can use the default priority of 20
|
||||||
if(sitemap.length > 1000){
|
if (sitemap.length > 1000) {
|
||||||
// set base to 21
|
// set base to 21
|
||||||
jobPriority = await getJobPriority({plan, team_id, basePriority: 21})
|
jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
|
||||||
}
|
}
|
||||||
const jobs = sitemap.map((x) => {
|
const jobs = sitemap.map((x) => {
|
||||||
const url = x.url;
|
const url = x.url;
|
||||||
@@ -187,12 +206,12 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
plan,
|
plan,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
priority: jobPriority,
|
priority: jobPriority
|
||||||
},
|
}
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -226,12 +245,12 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
team_id,
|
team_id,
|
||||||
plan: plan!,
|
plan: plan!,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
crawl_id: id,
|
crawl_id: id
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
|
priority: 15 // prioritize request 0 of crawl jobs same as scrape jobs
|
||||||
},
|
},
|
||||||
jobId,
|
jobId
|
||||||
);
|
);
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
@@ -240,8 +259,10 @@ export async function crawlController(req: Request, res: Response) {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
return res.status(500).json({ error: error instanceof ZodError
|
return res
|
||||||
? "Invalid URL"
|
.status(500)
|
||||||
: error.message });
|
.json({
|
||||||
|
error: error instanceof ZodError ? "Invalid URL" : error.message
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,13 @@ import { RateLimiterMode } from "../../../src/types";
|
|||||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { logger } from "../../../src/lib/logger";
|
import { logger } from "../../../src/lib/logger";
|
||||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
import {
|
||||||
|
addCrawlJob,
|
||||||
|
crawlToCrawler,
|
||||||
|
lockURL,
|
||||||
|
saveCrawl,
|
||||||
|
StoredCrawl
|
||||||
|
} from "../../../src/lib/crawl-redis";
|
||||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
@@ -12,11 +18,7 @@ import { fromLegacyScrapeOptions } from "../v1/types";
|
|||||||
|
|
||||||
export async function crawlPreviewController(req: Request, res: Response) {
|
export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, RateLimiterMode.Preview);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.Preview
|
|
||||||
);
|
|
||||||
|
|
||||||
const team_id = "preview";
|
const team_id = "preview";
|
||||||
|
|
||||||
@@ -39,16 +41,18 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (isUrlBlocked(url)) {
|
if (isUrlBlocked(url)) {
|
||||||
return res
|
return res.status(403).json({
|
||||||
.status(403)
|
|
||||||
.json({
|
|
||||||
error:
|
error:
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
|
onlyMainContent: false,
|
||||||
|
includeHtml: false,
|
||||||
|
removeTags: []
|
||||||
|
};
|
||||||
|
|
||||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||||
// try {
|
// try {
|
||||||
@@ -87,7 +91,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
robots = await this.getRobotsTxt();
|
robots = await this.getRobotsTxt();
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
|
|
||||||
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
|
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(
|
||||||
|
pageOptions,
|
||||||
|
undefined,
|
||||||
|
undefined
|
||||||
|
);
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: url,
|
originUrl: url,
|
||||||
@@ -97,20 +105,23 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
team_id,
|
team_id,
|
||||||
plan,
|
plan,
|
||||||
robots,
|
robots,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now()
|
||||||
};
|
};
|
||||||
|
|
||||||
await saveCrawl(id, sc);
|
await saveCrawl(id, sc);
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
|
|
||||||
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
const sitemap = sc.crawlerOptions?.ignoreSitemap
|
||||||
|
? null
|
||||||
|
: await crawler.tryGetSitemap();
|
||||||
|
|
||||||
if (sitemap !== null) {
|
if (sitemap !== null) {
|
||||||
for (const url of sitemap.map(x => x.url)) {
|
for (const url of sitemap.map((x) => x.url)) {
|
||||||
await lockURL(id, sc, url);
|
await lockURL(id, sc, url);
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
await addScrapeJob({
|
await addScrapeJob(
|
||||||
|
{
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
team_id,
|
team_id,
|
||||||
@@ -120,14 +131,18 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
origin: "website-preview",
|
origin: "website-preview",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true
|
||||||
}, {}, jobId);
|
},
|
||||||
|
{},
|
||||||
|
jobId
|
||||||
|
);
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
await lockURL(id, sc, url);
|
await lockURL(id, sc, url);
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
await addScrapeJob({
|
await addScrapeJob(
|
||||||
|
{
|
||||||
url,
|
url,
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
team_id,
|
team_id,
|
||||||
@@ -136,8 +151,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
internalOptions,
|
internalOptions,
|
||||||
origin: "website-preview",
|
origin: "website-preview",
|
||||||
crawl_id: id,
|
crawl_id: id
|
||||||
}, {}, jobId);
|
},
|
||||||
|
{},
|
||||||
|
jobId
|
||||||
|
);
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,12 @@
|
|||||||
|
|
||||||
import { AuthResponse, RateLimiterMode } from "../../types";
|
import { AuthResponse, RateLimiterMode } from "../../types";
|
||||||
|
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
|
|
||||||
|
|
||||||
export const keyAuthController = async (req: Request, res: Response) => {
|
export const keyAuthController = async (req: Request, res: Response) => {
|
||||||
try {
|
try {
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res);
|
||||||
req,
|
|
||||||
res
|
|
||||||
);
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return res.status(auth.status).json({ error: auth.error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
@@ -22,4 +17,3 @@ export const keyAuthController = async (req: Request, res: Response) => {
|
|||||||
return res.status(500).json({ error: error.message });
|
return res.status(500).json({ error: error.message });
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -2,19 +2,24 @@ import { ExtractorOptions, PageOptions } from "./../../lib/entities";
|
|||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import {
|
import {
|
||||||
billTeam,
|
billTeam,
|
||||||
checkTeamCredits,
|
checkTeamCredits
|
||||||
} from "../../services/billing/credit_billing";
|
} from "../../services/billing/credit_billing";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { PlanType, RateLimiterMode } from "../../types";
|
import { PlanType, RateLimiterMode } from "../../types";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
|
import {
|
||||||
|
Document,
|
||||||
|
fromLegacyCombo,
|
||||||
|
toLegacyDocument,
|
||||||
|
url as urlSchema
|
||||||
|
} from "../v1/types";
|
||||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||||
import {
|
import {
|
||||||
defaultPageOptions,
|
defaultPageOptions,
|
||||||
defaultExtractorOptions,
|
defaultExtractorOptions,
|
||||||
defaultTimeout,
|
defaultTimeout,
|
||||||
defaultOrigin,
|
defaultOrigin
|
||||||
} from "../../lib/default-values";
|
} from "../../lib/default-values";
|
||||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
@@ -50,13 +55,18 @@ export async function scrapeHelper(
|
|||||||
success: false,
|
success: false,
|
||||||
error:
|
error:
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||||
returnCode: 403,
|
returnCode: 403
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
||||||
|
|
||||||
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);
|
const { scrapeOptions, internalOptions } = fromLegacyCombo(
|
||||||
|
pageOptions,
|
||||||
|
extractorOptions,
|
||||||
|
timeout,
|
||||||
|
crawlerOptions
|
||||||
|
);
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
{
|
{
|
||||||
@@ -67,7 +77,7 @@ export async function scrapeHelper(
|
|||||||
internalOptions,
|
internalOptions,
|
||||||
plan: plan!,
|
plan: plan!,
|
||||||
origin: req.body.origin ?? defaultOrigin,
|
origin: req.body.origin ?? defaultOrigin,
|
||||||
is_scrape: true,
|
is_scrape: true
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId,
|
jobId,
|
||||||
@@ -80,18 +90,21 @@ export async function scrapeHelper(
|
|||||||
{
|
{
|
||||||
name: "Wait for job to finish",
|
name: "Wait for job to finish",
|
||||||
op: "bullmq.wait",
|
op: "bullmq.wait",
|
||||||
attributes: { job: jobId },
|
attributes: { job: jobId }
|
||||||
},
|
},
|
||||||
async (span) => {
|
async (span) => {
|
||||||
try {
|
try {
|
||||||
doc = (await waitForJob<Document>(jobId, timeout));
|
doc = await waitForJob<Document>(jobId, timeout);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
|
if (
|
||||||
|
e instanceof Error &&
|
||||||
|
(e.message.startsWith("Job wait") || e.message === "timeout")
|
||||||
|
) {
|
||||||
span.setAttribute("timedOut", true);
|
span.setAttribute("timedOut", true);
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: "Request timed out",
|
error: "Request timed out",
|
||||||
returnCode: 408,
|
returnCode: 408
|
||||||
};
|
};
|
||||||
} else if (
|
} else if (
|
||||||
typeof e === "string" &&
|
typeof e === "string" &&
|
||||||
@@ -104,7 +117,7 @@ export async function scrapeHelper(
|
|||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: e,
|
error: e,
|
||||||
returnCode: 500,
|
returnCode: 500
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
throw e;
|
throw e;
|
||||||
@@ -127,7 +140,7 @@ export async function scrapeHelper(
|
|||||||
success: true,
|
success: true,
|
||||||
error: "No page found",
|
error: "No page found",
|
||||||
returnCode: 200,
|
returnCode: 200,
|
||||||
data: doc,
|
data: doc
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,7 +166,7 @@ export async function scrapeHelper(
|
|||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: toLegacyDocument(doc, internalOptions),
|
data: toLegacyDocument(doc, internalOptions),
|
||||||
returnCode: 200,
|
returnCode: 200
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -161,11 +174,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
try {
|
try {
|
||||||
let earlyReturn = false;
|
let earlyReturn = false;
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, RateLimiterMode.Scrape);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.Scrape
|
|
||||||
);
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return res.status(auth.status).json({ error: auth.error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
@@ -176,7 +185,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||||
const extractorOptions = {
|
const extractorOptions = {
|
||||||
...defaultExtractorOptions,
|
...defaultExtractorOptions,
|
||||||
...req.body.extractorOptions,
|
...req.body.extractorOptions
|
||||||
};
|
};
|
||||||
const origin = req.body.origin ?? defaultOrigin;
|
const origin = req.body.origin ?? defaultOrigin;
|
||||||
let timeout = req.body.timeout ?? defaultTimeout;
|
let timeout = req.body.timeout ?? defaultTimeout;
|
||||||
@@ -188,7 +197,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
) {
|
) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
error:
|
error:
|
||||||
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
|
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified"
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -202,14 +211,19 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
await checkTeamCredits(chunk, team_id, 1);
|
await checkTeamCredits(chunk, team_id, 1);
|
||||||
if (!creditsCheckSuccess) {
|
if (!creditsCheckSuccess) {
|
||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
|
return res
|
||||||
|
.status(402)
|
||||||
|
.json({
|
||||||
|
error:
|
||||||
|
"Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing"
|
||||||
|
});
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(error);
|
logger.error(error);
|
||||||
earlyReturn = true;
|
earlyReturn = true;
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
error:
|
error:
|
||||||
"Error checking team credits. Please contact help@firecrawl.com for help.",
|
"Error checking team credits. Please contact help@firecrawl.com for help."
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -230,7 +244,10 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
const numTokens =
|
const numTokens =
|
||||||
result.data && (result.data as Document).markdown
|
result.data && (result.data as Document).markdown
|
||||||
? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
|
? numTokensFromString(
|
||||||
|
(result.data as Document).markdown!,
|
||||||
|
"gpt-3.5-turbo"
|
||||||
|
)
|
||||||
: 0;
|
: 0;
|
||||||
|
|
||||||
if (result.success) {
|
if (result.success) {
|
||||||
@@ -250,8 +267,10 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
if (creditsToBeBilled > 0) {
|
if (creditsToBeBilled > 0) {
|
||||||
// billing for doc done on queue end, bill only for llm extraction
|
// billing for doc done on queue end, bill only for llm extraction
|
||||||
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
|
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch((error) => {
|
||||||
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
logger.error(
|
||||||
|
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`
|
||||||
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -264,13 +283,17 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(pageOptions && pageOptions.includeExtract) {
|
if (pageOptions && pageOptions.includeExtract) {
|
||||||
if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
|
if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
|
||||||
delete (doc as Document).markdown;
|
delete (doc as Document).markdown;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
const { scrapeOptions } = fromLegacyScrapeOptions(
|
||||||
|
pageOptions,
|
||||||
|
extractorOptions,
|
||||||
|
timeout
|
||||||
|
);
|
||||||
|
|
||||||
logJob({
|
logJob({
|
||||||
job_id: jobId,
|
job_id: jobId,
|
||||||
@@ -285,7 +308,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
num_tokens: numTokens,
|
num_tokens: numTokens
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
@@ -298,7 +321,7 @@ export async function scrapeController(req: Request, res: Response) {
|
|||||||
? "Invalid URL"
|
? "Invalid URL"
|
||||||
: typeof error === "string"
|
: typeof error === "string"
|
||||||
? error
|
? error
|
||||||
: error?.message ?? "Internal Server Error",
|
: (error?.message ?? "Internal Server Error")
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
import {
|
||||||
|
billTeam,
|
||||||
|
checkTeamCredits
|
||||||
|
} from "../../services/billing/credit_billing";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { PlanType, RateLimiterMode } from "../../types";
|
import { PlanType, RateLimiterMode } from "../../types";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
@@ -13,7 +16,12 @@ import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
|||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
import { Job } from "bullmq";
|
import { Job } from "bullmq";
|
||||||
import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";
|
import {
|
||||||
|
Document,
|
||||||
|
fromLegacyCombo,
|
||||||
|
fromLegacyScrapeOptions,
|
||||||
|
toLegacyDocument
|
||||||
|
} from "../v1/types";
|
||||||
|
|
||||||
export async function searchHelper(
|
export async function searchHelper(
|
||||||
jobId: string,
|
jobId: string,
|
||||||
@@ -54,16 +62,23 @@ export async function searchHelper(
|
|||||||
filter: filter,
|
filter: filter,
|
||||||
lang: searchOptions.lang ?? "en",
|
lang: searchOptions.lang ?? "en",
|
||||||
country: searchOptions.country ?? "us",
|
country: searchOptions.country ?? "us",
|
||||||
location: searchOptions.location,
|
location: searchOptions.location
|
||||||
});
|
});
|
||||||
|
|
||||||
let justSearch = pageOptions.fetchPageContent === false;
|
let justSearch = pageOptions.fetchPageContent === false;
|
||||||
|
|
||||||
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);
|
const { scrapeOptions, internalOptions } = fromLegacyCombo(
|
||||||
|
pageOptions,
|
||||||
|
undefined,
|
||||||
|
60000,
|
||||||
|
crawlerOptions
|
||||||
|
);
|
||||||
|
|
||||||
if (justSearch) {
|
if (justSearch) {
|
||||||
billTeam(team_id, subscription_id, res.length).catch(error => {
|
billTeam(team_id, subscription_id, res.length).catch((error) => {
|
||||||
logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
|
logger.error(
|
||||||
|
`Failed to bill team ${team_id} for ${res.length} credits: ${error}`
|
||||||
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
return { success: true, data: res, returnCode: 200 };
|
return { success: true, data: res, returnCode: 200 };
|
||||||
@@ -78,11 +93,11 @@ export async function searchHelper(
|
|||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
|
|
||||||
const jobPriority = await getJobPriority({plan, team_id, basePriority: 20});
|
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 20 });
|
||||||
|
|
||||||
// filter out social media links
|
// filter out social media links
|
||||||
|
|
||||||
const jobDatas = res.map(x => {
|
const jobDatas = res.map((x) => {
|
||||||
const url = x.url;
|
const url = x.url;
|
||||||
const uuid = uuidv4();
|
const uuid = uuidv4();
|
||||||
return {
|
return {
|
||||||
@@ -92,28 +107,32 @@ export async function searchHelper(
|
|||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
scrapeOptions,
|
scrapeOptions,
|
||||||
internalOptions,
|
internalOptions
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
priority: jobPriority,
|
priority: jobPriority
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
})
|
});
|
||||||
|
|
||||||
// TODO: addScrapeJobs
|
// TODO: addScrapeJobs
|
||||||
for (const job of jobDatas) {
|
for (const job of jobDatas) {
|
||||||
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
|
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority);
|
||||||
}
|
}
|
||||||
|
|
||||||
const docs = (await Promise.all(jobDatas.map(x => waitForJob<Document>(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
|
const docs = (
|
||||||
|
await Promise.all(
|
||||||
|
jobDatas.map((x) => waitForJob<Document>(x.opts.jobId, 60000))
|
||||||
|
)
|
||||||
|
).map((x) => toLegacyDocument(x, internalOptions));
|
||||||
|
|
||||||
if (docs.length === 0) {
|
if (docs.length === 0) {
|
||||||
return { success: true, error: "No search results found", returnCode: 200 };
|
return { success: true, error: "No search results found", returnCode: 200 };
|
||||||
}
|
}
|
||||||
|
|
||||||
const sq = getScrapeQueue();
|
const sq = getScrapeQueue();
|
||||||
await Promise.all(jobDatas.map(x => sq.remove(x.opts.jobId)));
|
await Promise.all(jobDatas.map((x) => sq.remove(x.opts.jobId)));
|
||||||
|
|
||||||
// make sure doc.content is not empty
|
// make sure doc.content is not empty
|
||||||
const filteredDocs = docs.filter(
|
const filteredDocs = docs.filter(
|
||||||
@@ -121,24 +140,25 @@ export async function searchHelper(
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (filteredDocs.length === 0) {
|
if (filteredDocs.length === 0) {
|
||||||
return { success: true, error: "No page found", returnCode: 200, data: docs };
|
return {
|
||||||
|
success: true,
|
||||||
|
error: "No page found",
|
||||||
|
returnCode: 200,
|
||||||
|
data: docs
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: filteredDocs,
|
data: filteredDocs,
|
||||||
returnCode: 200,
|
returnCode: 200
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function searchController(req: Request, res: Response) {
|
export async function searchController(req: Request, res: Response) {
|
||||||
try {
|
try {
|
||||||
// make sure to authenticate user first, Bearer <token>
|
// make sure to authenticate user first, Bearer <token>
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, RateLimiterMode.Search);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
RateLimiterMode.Search
|
|
||||||
);
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return res.status(auth.status).json({ error: auth.error });
|
return res.status(auth.status).json({ error: auth.error });
|
||||||
}
|
}
|
||||||
@@ -149,7 +169,7 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||||
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||||
removeTags: req.body.pageOptions?.removeTags ?? [],
|
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||||
fallback: req.body.pageOptions?.fallback ?? false,
|
fallback: req.body.pageOptions?.fallback ?? false
|
||||||
};
|
};
|
||||||
const origin = req.body.origin ?? "api";
|
const origin = req.body.origin ?? "api";
|
||||||
|
|
||||||
@@ -192,11 +212,14 @@ export async function searchController(req: Request, res: Response) {
|
|||||||
mode: "search",
|
mode: "search",
|
||||||
url: req.body.query,
|
url: req.body.query,
|
||||||
crawlerOptions: crawlerOptions,
|
crawlerOptions: crawlerOptions,
|
||||||
origin: origin,
|
origin: origin
|
||||||
});
|
});
|
||||||
return res.status(result.returnCode).json(result);
|
return res.status(result.returnCode).json(result);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && (error.message.startsWith("Job wait") || error.message === "timeout")) {
|
if (
|
||||||
|
error instanceof Error &&
|
||||||
|
(error.message.startsWith("Job wait") || error.message === "timeout")
|
||||||
|
) {
|
||||||
return res.status(408).json({ error: "Request timed out" });
|
return res.status(408).json({ error: "Request timed out" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,10 @@ import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
|||||||
import { getJobs } from "./crawl-status";
|
import { getJobs } from "./crawl-status";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
|
||||||
export async function crawlJobStatusPreviewController(req: Request, res: Response) {
|
export async function crawlJobStatusPreviewController(
|
||||||
|
req: Request,
|
||||||
|
res: Response
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
@@ -22,18 +25,30 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
|
|||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
|
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort(
|
||||||
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
|
(a, b) => a.timestamp - b.timestamp
|
||||||
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
|
);
|
||||||
|
const jobStatuses = await Promise.all(jobs.map((x) => x.getState()));
|
||||||
|
const jobStatus = sc.cancelled
|
||||||
|
? "failed"
|
||||||
|
: jobStatuses.every((x) => x === "completed")
|
||||||
|
? "completed"
|
||||||
|
: jobStatuses.some((x) => x === "failed")
|
||||||
|
? "failed"
|
||||||
|
: "active";
|
||||||
|
|
||||||
const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
|
const data = jobs.map((x) =>
|
||||||
|
Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue
|
||||||
|
);
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
status: jobStatus,
|
status: jobStatus,
|
||||||
current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
|
current: jobStatuses.filter((x) => x === "completed" || x === "failed")
|
||||||
|
.length,
|
||||||
total: jobs.length,
|
total: jobs.length,
|
||||||
data: jobStatus === "completed" ? data : null,
|
data: jobStatus === "completed" ? data : null,
|
||||||
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null),
|
partial_data:
|
||||||
|
jobStatus === "completed" ? [] : data.filter((x) => x !== null)
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
|
|||||||
@@ -24,11 +24,15 @@ describe("URL Schema Validation", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should reject URLs without a valid top-level domain", () => {
|
it("should reject URLs without a valid top-level domain", () => {
|
||||||
expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
|
expect(() => url.parse("http://example")).toThrow(
|
||||||
|
"URL must have a valid top-level domain or be a valid path"
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject blocked URLs", () => {
|
it("should reject blocked URLs", () => {
|
||||||
expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
expect(() => url.parse("https://facebook.com")).toThrow(
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should handle URLs with subdomains correctly", () => {
|
it("should handle URLs with subdomains correctly", () => {
|
||||||
@@ -42,23 +46,33 @@ describe("URL Schema Validation", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should handle URLs with subdomains that are blocked", () => {
|
it("should handle URLs with subdomains that are blocked", () => {
|
||||||
expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
expect(() => url.parse("https://sub.facebook.com")).toThrow(
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should handle URLs with paths that are blocked", () => {
|
it("should handle URLs with paths that are blocked", () => {
|
||||||
expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
expect(() => url.parse("http://facebook.com/path")).toThrow(
|
||||||
expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
|
expect(() => url.parse("https://facebook.com/another/path")).toThrow(
|
||||||
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject malformed URLs starting with 'http://http'", () => {
|
it("should reject malformed URLs starting with 'http://http'", () => {
|
||||||
expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
|
expect(() => url.parse("http://http://example.com")).toThrow(
|
||||||
|
"Invalid URL. Invalid protocol."
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||||
expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
|
expect(() =>
|
||||||
|
url.parse("http://example.com/http://example.com")
|
||||||
|
).not.toThrow();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||||
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
||||||
});
|
});
|
||||||
})
|
});
|
||||||
|
|||||||
@@ -5,14 +5,14 @@ import {
|
|||||||
batchScrapeRequestSchema,
|
batchScrapeRequestSchema,
|
||||||
CrawlResponse,
|
CrawlResponse,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
ScrapeOptions,
|
ScrapeOptions
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
addCrawlJobs,
|
addCrawlJobs,
|
||||||
getCrawl,
|
getCrawl,
|
||||||
lockURLs,
|
lockURLs,
|
||||||
saveCrawl,
|
saveCrawl,
|
||||||
StoredCrawl,
|
StoredCrawl
|
||||||
} from "../../lib/crawl-redis";
|
} from "../../lib/crawl-redis";
|
||||||
import { logCrawl } from "../../services/logging/crawl_log";
|
import { logCrawl } from "../../services/logging/crawl_log";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
@@ -27,26 +27,39 @@ export async function batchScrapeController(
|
|||||||
req.body = batchScrapeRequestSchema.parse(req.body);
|
req.body = batchScrapeRequestSchema.parse(req.body);
|
||||||
|
|
||||||
const id = req.body.appendToId ?? uuidv4();
|
const id = req.body.appendToId ?? uuidv4();
|
||||||
const logger = _logger.child({ crawlId: id, batchScrapeId: id, module: "api/v1", method: "batchScrapeController", teamId: req.auth.team_id, plan: req.auth.plan });
|
const logger = _logger.child({
|
||||||
logger.debug("Batch scrape " + id + " starting", { urlsLength: req.body.urls, appendToId: req.body.appendToId, account: req.account });
|
crawlId: id,
|
||||||
|
batchScrapeId: id,
|
||||||
|
module: "api/v1",
|
||||||
|
method: "batchScrapeController",
|
||||||
|
teamId: req.auth.team_id,
|
||||||
|
plan: req.auth.plan
|
||||||
|
});
|
||||||
|
logger.debug("Batch scrape " + id + " starting", {
|
||||||
|
urlsLength: req.body.urls,
|
||||||
|
appendToId: req.body.appendToId,
|
||||||
|
account: req.account
|
||||||
|
});
|
||||||
|
|
||||||
if (!req.body.appendToId) {
|
if (!req.body.appendToId) {
|
||||||
await logCrawl(id, req.auth.team_id);
|
await logCrawl(id, req.auth.team_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
let { remainingCredits } = req.account!;
|
let { remainingCredits } = req.account!;
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
if(!useDbAuthentication){
|
if (!useDbAuthentication) {
|
||||||
remainingCredits = Infinity;
|
remainingCredits = Infinity;
|
||||||
}
|
}
|
||||||
|
|
||||||
const sc: StoredCrawl = req.body.appendToId ? await getCrawl(req.body.appendToId) as StoredCrawl : {
|
const sc: StoredCrawl = req.body.appendToId
|
||||||
|
? ((await getCrawl(req.body.appendToId)) as StoredCrawl)
|
||||||
|
: {
|
||||||
crawlerOptions: null,
|
crawlerOptions: null,
|
||||||
scrapeOptions: req.body,
|
scrapeOptions: req.body,
|
||||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for batch scrapes to ensure contentful scrape, speed does not matter
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!req.body.appendToId) {
|
if (!req.body.appendToId) {
|
||||||
@@ -57,9 +70,13 @@ export async function batchScrapeController(
|
|||||||
|
|
||||||
// If it is over 1000, we need to get the job priority,
|
// If it is over 1000, we need to get the job priority,
|
||||||
// otherwise we can use the default priority of 20
|
// otherwise we can use the default priority of 20
|
||||||
if(req.body.urls.length > 1000){
|
if (req.body.urls.length > 1000) {
|
||||||
// set base to 21
|
// set base to 21
|
||||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
jobPriority = await getJobPriority({
|
||||||
|
plan: req.auth.plan,
|
||||||
|
team_id: req.auth.team_id,
|
||||||
|
basePriority: 21
|
||||||
|
});
|
||||||
}
|
}
|
||||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||||
|
|
||||||
@@ -80,12 +97,12 @@ export async function batchScrapeController(
|
|||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
v1: true,
|
v1: true,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuidv4(),
|
jobId: uuidv4(),
|
||||||
priority: 20,
|
priority: 20
|
||||||
},
|
}
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -103,9 +120,18 @@ export async function batchScrapeController(
|
|||||||
logger.debug("Adding scrape jobs to BullMQ...");
|
logger.debug("Adding scrape jobs to BullMQ...");
|
||||||
await addScrapeJobs(jobs);
|
await addScrapeJobs(jobs);
|
||||||
|
|
||||||
if(req.body.webhook) {
|
if (req.body.webhook) {
|
||||||
logger.debug("Calling webhook with batch_scrape.started...", { webhook: req.body.webhook });
|
logger.debug("Calling webhook with batch_scrape.started...", {
|
||||||
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
|
webhook: req.body.webhook
|
||||||
|
});
|
||||||
|
await callWebhook(
|
||||||
|
req.auth.team_id,
|
||||||
|
id,
|
||||||
|
null,
|
||||||
|
req.body.webhook,
|
||||||
|
true,
|
||||||
|
"batch_scrape.started"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||||
@@ -113,8 +139,6 @@ export async function batchScrapeController(
|
|||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
id,
|
id,
|
||||||
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
|
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { authenticateUser } from "../auth";
|
|||||||
import {
|
import {
|
||||||
ConcurrencyCheckParams,
|
ConcurrencyCheckParams,
|
||||||
ConcurrencyCheckResponse,
|
ConcurrencyCheckResponse,
|
||||||
RequestWithAuth,
|
RequestWithAuth
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { RateLimiterMode } from "../../types";
|
import { RateLimiterMode } from "../../types";
|
||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
|
|||||||
@@ -7,9 +7,12 @@ import { configDotenv } from "dotenv";
|
|||||||
import { RequestWithAuth } from "./types";
|
import { RequestWithAuth } from "./types";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function crawlCancelController(req: RequestWithAuth<{ jobId: string }>, res: Response) {
|
export async function crawlCancelController(
|
||||||
|
req: RequestWithAuth<{ jobId: string }>,
|
||||||
|
res: Response
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
|
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
|
|||||||
@@ -1,32 +1,47 @@
|
|||||||
import { authMiddleware } from "../../routes/v1";
|
import { authMiddleware } from "../../routes/v1";
|
||||||
import { RateLimiterMode } from "../../types";
|
import { RateLimiterMode } from "../../types";
|
||||||
import { authenticateUser } from "../auth";
|
import { authenticateUser } from "../auth";
|
||||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
|
import {
|
||||||
|
CrawlStatusParams,
|
||||||
|
CrawlStatusResponse,
|
||||||
|
Document,
|
||||||
|
ErrorResponse,
|
||||||
|
RequestWithAuth
|
||||||
|
} from "./types";
|
||||||
import { WebSocket } from "ws";
|
import { WebSocket } from "ws";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
import {
|
||||||
|
getCrawl,
|
||||||
|
getCrawlExpiry,
|
||||||
|
getCrawlJobs,
|
||||||
|
getDoneJobsOrdered,
|
||||||
|
getDoneJobsOrderedLength,
|
||||||
|
getThrottledJobs,
|
||||||
|
isCrawlFinished,
|
||||||
|
isCrawlFinishedLocked
|
||||||
|
} from "../../lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { getJob, getJobs } from "./crawl-status";
|
import { getJob, getJobs } from "./crawl-status";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { Job, JobState } from "bullmq";
|
import { Job, JobState } from "bullmq";
|
||||||
|
|
||||||
type ErrorMessage = {
|
type ErrorMessage = {
|
||||||
type: "error",
|
type: "error";
|
||||||
error: string,
|
error: string;
|
||||||
}
|
};
|
||||||
|
|
||||||
type CatchupMessage = {
|
type CatchupMessage = {
|
||||||
type: "catchup",
|
type: "catchup";
|
||||||
data: CrawlStatusResponse,
|
data: CrawlStatusResponse;
|
||||||
}
|
};
|
||||||
|
|
||||||
type DocumentMessage = {
|
type DocumentMessage = {
|
||||||
type: "document",
|
type: "document";
|
||||||
data: Document,
|
data: Document;
|
||||||
}
|
};
|
||||||
|
|
||||||
type DoneMessage = { type: "done" }
|
type DoneMessage = { type: "done" };
|
||||||
|
|
||||||
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
||||||
|
|
||||||
@@ -47,7 +62,10 @@ function close(ws: WebSocket, code: number, msg: Message) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
async function crawlStatusWS(
|
||||||
|
ws: WebSocket,
|
||||||
|
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>
|
||||||
|
) {
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
return close(ws, 1008, { type: "error", error: "Job not found" });
|
return close(ws, 1008, { type: "error", error: "Job not found" });
|
||||||
@@ -69,17 +87,23 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||||||
return close(ws, 1000, { type: "done" });
|
return close(ws, 1000, { type: "done" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
const notDoneJobIDs = jobIDs.filter((x) => !doneJobIDs.includes(x));
|
||||||
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
const jobStatuses = await Promise.all(
|
||||||
const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
notDoneJobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)])
|
||||||
const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]
|
);
|
||||||
|
const newlyDoneJobIDs: string[] = jobStatuses
|
||||||
|
.filter((x) => x[1] === "completed" || x[1] === "failed")
|
||||||
|
.map((x) => x[0]);
|
||||||
|
const newlyDoneJobs: Job[] = (
|
||||||
|
await Promise.all(newlyDoneJobIDs.map((x) => getJob(x)))
|
||||||
|
).filter((x) => x !== undefined) as Job[];
|
||||||
|
|
||||||
for (const job of newlyDoneJobs) {
|
for (const job of newlyDoneJobs) {
|
||||||
if (job.returnvalue) {
|
if (job.returnvalue) {
|
||||||
send(ws, {
|
send(ws, {
|
||||||
type: "document",
|
type: "document",
|
||||||
data: job.returnvalue,
|
data: job.returnvalue
|
||||||
})
|
});
|
||||||
} else {
|
} else {
|
||||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||||
}
|
}
|
||||||
@@ -95,8 +119,10 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||||||
doneJobIDs = await getDoneJobsOrdered(req.params.jobId);
|
doneJobIDs = await getDoneJobsOrdered(req.params.jobId);
|
||||||
|
|
||||||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||||
let jobStatuses = await Promise.all(jobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)] as const));
|
let jobStatuses = await Promise.all(
|
||||||
const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
|
jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const)
|
||||||
|
);
|
||||||
|
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
||||||
|
|
||||||
const throttledJobsSet = new Set(throttledJobs);
|
const throttledJobsSet = new Set(throttledJobs);
|
||||||
|
|
||||||
@@ -104,18 +130,27 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||||||
const validJobIDs: string[] = [];
|
const validJobIDs: string[] = [];
|
||||||
|
|
||||||
for (const [id, status] of jobStatuses) {
|
for (const [id, status] of jobStatuses) {
|
||||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
if (
|
||||||
|
!throttledJobsSet.has(id) &&
|
||||||
|
status !== "failed" &&
|
||||||
|
status !== "unknown"
|
||||||
|
) {
|
||||||
validJobStatuses.push([id, status]);
|
validJobStatuses.push([id, status]);
|
||||||
validJobIDs.push(id);
|
validJobIDs.push(id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
|
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
|
||||||
|
sc.cancelled
|
||||||
|
? "cancelled"
|
||||||
|
: validJobStatuses.every((x) => x[1] === "completed")
|
||||||
|
? "completed"
|
||||||
|
: "scraping";
|
||||||
|
|
||||||
jobIDs = validJobIDs; // Use validJobIDs instead of jobIDs for further processing
|
jobIDs = validJobIDs; // Use validJobIDs instead of jobIDs for further processing
|
||||||
|
|
||||||
const doneJobs = await getJobs(doneJobIDs);
|
const doneJobs = await getJobs(doneJobIDs);
|
||||||
const data = doneJobs.map(x => x.returnvalue);
|
const data = doneJobs.map((x) => x.returnvalue);
|
||||||
|
|
||||||
send(ws, {
|
send(ws, {
|
||||||
type: "catchup",
|
type: "catchup",
|
||||||
@@ -126,7 +161,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||||||
completed: doneJobIDs.length,
|
completed: doneJobIDs.length,
|
||||||
creditsUsed: jobIDs.length,
|
creditsUsed: jobIDs.length,
|
||||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||||
data: data,
|
data: data
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -137,18 +172,17 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Basically just middleware and error wrapping
|
// Basically just middleware and error wrapping
|
||||||
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
export async function crawlStatusWSController(
|
||||||
|
ws: WebSocket,
|
||||||
|
req: RequestWithAuth<CrawlStatusParams, undefined, undefined>
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, null, RateLimiterMode.CrawlStatus);
|
||||||
req,
|
|
||||||
null,
|
|
||||||
RateLimiterMode.CrawlStatus,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
return close(ws, 3000, {
|
return close(ws, 3000, {
|
||||||
type: "error",
|
type: "error",
|
||||||
error: auth.error,
|
error: auth.error
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -167,15 +201,24 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
|
|||||||
verbose = JSON.stringify({
|
verbose = JSON.stringify({
|
||||||
message: err.message,
|
message: err.message,
|
||||||
name: err.name,
|
name: err.name,
|
||||||
stack: err.stack,
|
stack: err.stack
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
logger.error(
|
||||||
|
"Error occurred in WebSocket! (" +
|
||||||
|
req.path +
|
||||||
|
") -- ID " +
|
||||||
|
id +
|
||||||
|
" -- " +
|
||||||
|
verbose
|
||||||
|
);
|
||||||
return close(ws, 1011, {
|
return close(ws, 1011, {
|
||||||
type: "error",
|
type: "error",
|
||||||
error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
|
error:
|
||||||
|
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
|
||||||
|
id
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +1,23 @@
|
|||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
|
import {
|
||||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
|
CrawlStatusParams,
|
||||||
|
CrawlStatusResponse,
|
||||||
|
ErrorResponse,
|
||||||
|
RequestWithAuth
|
||||||
|
} from "./types";
|
||||||
|
import {
|
||||||
|
getCrawl,
|
||||||
|
getCrawlExpiry,
|
||||||
|
getCrawlJobs,
|
||||||
|
getDoneJobsOrdered,
|
||||||
|
getDoneJobsOrderedLength,
|
||||||
|
getThrottledJobs
|
||||||
|
} from "../../lib/crawl-redis";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
import {
|
||||||
|
supabaseGetJobById,
|
||||||
|
supabaseGetJobsById
|
||||||
|
} from "../../lib/supabase-jobs";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import { Job, JobState } from "bullmq";
|
import { Job, JobState } from "bullmq";
|
||||||
import { logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
@@ -20,33 +35,43 @@ export async function getJob(id: string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
job.returnvalue = Array.isArray(job.returnvalue)
|
||||||
|
? job.returnvalue[0]
|
||||||
|
: job.returnvalue;
|
||||||
|
|
||||||
return job;
|
return job;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getJobs(ids: string[]) {
|
export async function getJobs(ids: string[]) {
|
||||||
const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
|
const jobs: (Job & { id: string })[] = (
|
||||||
|
await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
|
||||||
|
).filter((x) => x) as (Job & { id: string })[];
|
||||||
|
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||||
const supabaseData = await supabaseGetJobsById(ids);
|
const supabaseData = await supabaseGetJobsById(ids);
|
||||||
|
|
||||||
supabaseData.forEach(x => {
|
supabaseData.forEach((x) => {
|
||||||
const job = jobs.find(y => y.id === x.job_id);
|
const job = jobs.find((y) => y.id === x.job_id);
|
||||||
if (job) {
|
if (job) {
|
||||||
job.returnvalue = x.docs;
|
job.returnvalue = x.docs;
|
||||||
}
|
}
|
||||||
})
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
jobs.forEach(job => {
|
jobs.forEach((job) => {
|
||||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
job.returnvalue = Array.isArray(job.returnvalue)
|
||||||
|
? job.returnvalue[0]
|
||||||
|
: job.returnvalue;
|
||||||
});
|
});
|
||||||
|
|
||||||
return jobs;
|
return jobs;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
|
export async function crawlStatusController(
|
||||||
|
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>,
|
||||||
|
res: Response<CrawlStatusResponse>,
|
||||||
|
isBatch = false
|
||||||
|
) {
|
||||||
const sc = await getCrawl(req.params.jobId);
|
const sc = await getCrawl(req.params.jobId);
|
||||||
if (!sc) {
|
if (!sc) {
|
||||||
return res.status(404).json({ success: false, error: "Job not found" });
|
return res.status(404).json({ success: false, error: "Job not found" });
|
||||||
@@ -56,12 +81,18 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||||||
return res.status(403).json({ success: false, error: "Forbidden" });
|
return res.status(403).json({ success: false, error: "Forbidden" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
|
const start =
|
||||||
const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
|
typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
|
||||||
|
const end =
|
||||||
|
typeof req.query.limit === "string"
|
||||||
|
? start + parseInt(req.query.limit, 10) - 1
|
||||||
|
: undefined;
|
||||||
|
|
||||||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||||
let jobStatuses = await Promise.all(jobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)] as const));
|
let jobStatuses = await Promise.all(
|
||||||
const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
|
jobIDs.map(async (x) => [x, await getScrapeQueue().getJobState(x)] as const)
|
||||||
|
);
|
||||||
|
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
||||||
|
|
||||||
const throttledJobsSet = new Set(throttledJobs);
|
const throttledJobsSet = new Set(throttledJobs);
|
||||||
|
|
||||||
@@ -69,30 +100,48 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||||||
const validJobIDs: string[] = [];
|
const validJobIDs: string[] = [];
|
||||||
|
|
||||||
for (const [id, status] of jobStatuses) {
|
for (const [id, status] of jobStatuses) {
|
||||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
if (
|
||||||
|
!throttledJobsSet.has(id) &&
|
||||||
|
status !== "failed" &&
|
||||||
|
status !== "unknown"
|
||||||
|
) {
|
||||||
validJobStatuses.push([id, status]);
|
validJobStatuses.push([id, status]);
|
||||||
validJobIDs.push(id);
|
validJobIDs.push(id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
|
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
|
||||||
|
sc.cancelled
|
||||||
|
? "cancelled"
|
||||||
|
: validJobStatuses.every((x) => x[1] === "completed")
|
||||||
|
? "completed"
|
||||||
|
: "scraping";
|
||||||
|
|
||||||
// Use validJobIDs instead of jobIDs for further processing
|
// Use validJobIDs instead of jobIDs for further processing
|
||||||
jobIDs = validJobIDs;
|
jobIDs = validJobIDs;
|
||||||
|
|
||||||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
const doneJobsOrder = await getDoneJobsOrdered(
|
||||||
|
req.params.jobId,
|
||||||
|
start,
|
||||||
|
end ?? -1
|
||||||
|
);
|
||||||
|
|
||||||
let doneJobs: Job[] = [];
|
let doneJobs: Job[] = [];
|
||||||
|
|
||||||
if (end === undefined) { // determine 10 megabyte limit
|
if (end === undefined) {
|
||||||
|
// determine 10 megabyte limit
|
||||||
let bytes = 0;
|
let bytes = 0;
|
||||||
const bytesLimit = 10485760; // 10 MiB in bytes
|
const bytesLimit = 10485760; // 10 MiB in bytes
|
||||||
const factor = 100; // chunking for faster retrieval
|
const factor = 100; // chunking for faster retrieval
|
||||||
|
|
||||||
for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
|
for (
|
||||||
|
let i = 0;
|
||||||
|
i < doneJobsOrder.length && bytes < bytesLimit;
|
||||||
|
i += factor
|
||||||
|
) {
|
||||||
// get current chunk and retrieve jobs
|
// get current chunk and retrieve jobs
|
||||||
const currentIDs = doneJobsOrder.slice(i, i+factor);
|
const currentIDs = doneJobsOrder.slice(i, i + factor);
|
||||||
const jobs = await getJobs(currentIDs);
|
const jobs = await getJobs(currentIDs);
|
||||||
|
|
||||||
// iterate through jobs and add them one them one to the byte counter
|
// iterate through jobs and add them one them one to the byte counter
|
||||||
@@ -101,12 +150,16 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||||||
const job = jobs[ii];
|
const job = jobs[ii];
|
||||||
const state = await job.getState();
|
const state = await job.getState();
|
||||||
|
|
||||||
if (state === "failed" || state === "active") { // TODO: why is active here? race condition? shouldn't matter tho - MG
|
if (state === "failed" || state === "active") {
|
||||||
|
// TODO: why is active here? race condition? shouldn't matter tho - MG
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (job.returnvalue === undefined) {
|
if (job.returnvalue === undefined) {
|
||||||
logger.warn("Job was considered done, but returnvalue is undefined!", { jobId: job.id, state });
|
logger.warn(
|
||||||
|
"Job was considered done, but returnvalue is undefined!",
|
||||||
|
{ jobId: job.id, state }
|
||||||
|
);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
doneJobs.push(job);
|
doneJobs.push(job);
|
||||||
@@ -119,13 +172,21 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||||||
doneJobs.splice(doneJobs.length - 1, 1);
|
doneJobs.splice(doneJobs.length - 1, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
doneJobs = (await Promise.all((await getJobs(doneJobsOrder)).map(async x => (await x.getState()) === "failed" ? null : x))).filter(x => x !== null) as Job[];
|
doneJobs = (
|
||||||
|
await Promise.all(
|
||||||
|
(await getJobs(doneJobsOrder)).map(async (x) =>
|
||||||
|
(await x.getState()) === "failed" ? null : x
|
||||||
|
)
|
||||||
|
)
|
||||||
|
).filter((x) => x !== null) as Job[];
|
||||||
}
|
}
|
||||||
|
|
||||||
const data = doneJobs.map(x => x.returnvalue);
|
const data = doneJobs.map((x) => x.returnvalue);
|
||||||
|
|
||||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||||
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
|
const nextURL = new URL(
|
||||||
|
`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`
|
||||||
|
);
|
||||||
|
|
||||||
nextURL.searchParams.set("skip", (start + data.length).toString());
|
nextURL.searchParams.set("skip", (start + data.length).toString());
|
||||||
|
|
||||||
@@ -151,10 +212,9 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||||||
creditsUsed: jobIDs.length,
|
creditsUsed: jobIDs.length,
|
||||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||||
next:
|
next:
|
||||||
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
|
status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this
|
||||||
? undefined
|
? undefined
|
||||||
: nextURL.href,
|
: nextURL.href,
|
||||||
data: data,
|
data: data
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import {
|
|||||||
crawlRequestSchema,
|
crawlRequestSchema,
|
||||||
CrawlResponse,
|
CrawlResponse,
|
||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
toLegacyCrawlerOptions,
|
toLegacyCrawlerOptions
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import {
|
import {
|
||||||
addCrawlJob,
|
addCrawlJob,
|
||||||
@@ -14,7 +14,7 @@ import {
|
|||||||
lockURL,
|
lockURL,
|
||||||
lockURLs,
|
lockURLs,
|
||||||
saveCrawl,
|
saveCrawl,
|
||||||
StoredCrawl,
|
StoredCrawl
|
||||||
} from "../../lib/crawl-redis";
|
} from "../../lib/crawl-redis";
|
||||||
import { logCrawl } from "../../services/logging/crawl_log";
|
import { logCrawl } from "../../services/logging/crawl_log";
|
||||||
import { getScrapeQueue } from "../../services/queue-service";
|
import { getScrapeQueue } from "../../services/queue-service";
|
||||||
@@ -32,21 +32,31 @@ export async function crawlController(
|
|||||||
req.body = crawlRequestSchema.parse(req.body);
|
req.body = crawlRequestSchema.parse(req.body);
|
||||||
|
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
const logger = _logger.child({ crawlId: id, module: "api/v1", method: "crawlController", teamId: req.auth.team_id, plan: req.auth.plan });
|
const logger = _logger.child({
|
||||||
logger.debug("Crawl " + id + " starting", { request: req.body, originalRequest: preNormalizedBody, account: req.account });
|
crawlId: id,
|
||||||
|
module: "api/v1",
|
||||||
|
method: "crawlController",
|
||||||
|
teamId: req.auth.team_id,
|
||||||
|
plan: req.auth.plan
|
||||||
|
});
|
||||||
|
logger.debug("Crawl " + id + " starting", {
|
||||||
|
request: req.body,
|
||||||
|
originalRequest: preNormalizedBody,
|
||||||
|
account: req.account
|
||||||
|
});
|
||||||
|
|
||||||
await logCrawl(id, req.auth.team_id);
|
await logCrawl(id, req.auth.team_id);
|
||||||
|
|
||||||
let { remainingCredits } = req.account!;
|
let { remainingCredits } = req.account!;
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
if(!useDbAuthentication){
|
if (!useDbAuthentication) {
|
||||||
remainingCredits = Infinity;
|
remainingCredits = Infinity;
|
||||||
}
|
}
|
||||||
|
|
||||||
const crawlerOptions = {
|
const crawlerOptions = {
|
||||||
...req.body,
|
...req.body,
|
||||||
url: undefined,
|
url: undefined,
|
||||||
scrapeOptions: undefined,
|
scrapeOptions: undefined
|
||||||
};
|
};
|
||||||
const scrapeOptions = req.body.scrapeOptions;
|
const scrapeOptions = req.body.scrapeOptions;
|
||||||
|
|
||||||
@@ -73,7 +83,11 @@ export async function crawlController(
|
|||||||
|
|
||||||
const originalLimit = crawlerOptions.limit;
|
const originalLimit = crawlerOptions.limit;
|
||||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||||
logger.debug("Determined limit: " + crawlerOptions.limit, { remainingCredits, bodyLimit: originalLimit, originalBodyLimit: preNormalizedBody.limit });
|
logger.debug("Determined limit: " + crawlerOptions.limit, {
|
||||||
|
remainingCredits,
|
||||||
|
bodyLimit: originalLimit,
|
||||||
|
originalBodyLimit: preNormalizedBody.limit
|
||||||
|
});
|
||||||
|
|
||||||
const sc: StoredCrawl = {
|
const sc: StoredCrawl = {
|
||||||
originUrl: req.body.url,
|
originUrl: req.body.url,
|
||||||
@@ -82,7 +96,7 @@ export async function crawlController(
|
|||||||
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
internalOptions: { disableSmartWaitCache: true }, // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan
|
||||||
};
|
};
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
@@ -90,7 +104,9 @@ export async function crawlController(
|
|||||||
try {
|
try {
|
||||||
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.debug("Failed to get robots.txt (this is probably fine!)", { error: e });
|
logger.debug("Failed to get robots.txt (this is probably fine!)", {
|
||||||
|
error: e
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
await saveCrawl(id, sc);
|
await saveCrawl(id, sc);
|
||||||
@@ -100,13 +116,19 @@ export async function crawlController(
|
|||||||
: await crawler.tryGetSitemap();
|
: await crawler.tryGetSitemap();
|
||||||
|
|
||||||
if (sitemap !== null && sitemap.length > 0) {
|
if (sitemap !== null && sitemap.length > 0) {
|
||||||
logger.debug("Using sitemap of length " + sitemap.length, { sitemapLength: sitemap.length });
|
logger.debug("Using sitemap of length " + sitemap.length, {
|
||||||
|
sitemapLength: sitemap.length
|
||||||
|
});
|
||||||
let jobPriority = 20;
|
let jobPriority = 20;
|
||||||
// If it is over 1000, we need to get the job priority,
|
// If it is over 1000, we need to get the job priority,
|
||||||
// otherwise we can use the default priority of 20
|
// otherwise we can use the default priority of 20
|
||||||
if(sitemap.length > 1000){
|
if (sitemap.length > 1000) {
|
||||||
// set base to 21
|
// set base to 21
|
||||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
jobPriority = await getJobPriority({
|
||||||
|
plan: req.auth.plan,
|
||||||
|
team_id: req.auth.team_id,
|
||||||
|
basePriority: 21
|
||||||
|
});
|
||||||
}
|
}
|
||||||
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
logger.debug("Using job priority " + jobPriority, { jobPriority });
|
||||||
|
|
||||||
@@ -127,14 +149,14 @@ export async function crawlController(
|
|||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
sitemapped: true,
|
sitemapped: true,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook,
|
||||||
v1: true,
|
v1: true
|
||||||
},
|
},
|
||||||
opts: {
|
opts: {
|
||||||
jobId: uuid,
|
jobId: uuid,
|
||||||
priority: 20,
|
priority: 20
|
||||||
},
|
}
|
||||||
};
|
};
|
||||||
})
|
});
|
||||||
|
|
||||||
logger.debug("Locking URLs...");
|
logger.debug("Locking URLs...");
|
||||||
await lockURLs(
|
await lockURLs(
|
||||||
@@ -150,7 +172,9 @@ export async function crawlController(
|
|||||||
logger.debug("Adding scrape jobs to BullMQ...");
|
logger.debug("Adding scrape jobs to BullMQ...");
|
||||||
await getScrapeQueue().addBulk(jobs);
|
await getScrapeQueue().addBulk(jobs);
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Sitemap not found or ignored.", { ignoreSitemap: sc.crawlerOptions.ignoreSitemap });
|
logger.debug("Sitemap not found or ignored.", {
|
||||||
|
ignoreSitemap: sc.crawlerOptions.ignoreSitemap
|
||||||
|
});
|
||||||
|
|
||||||
logger.debug("Locking URL...");
|
logger.debug("Locking URL...");
|
||||||
await lockURL(id, sc, req.body.url);
|
await lockURL(id, sc, req.body.url);
|
||||||
@@ -168,21 +192,30 @@ export async function crawlController(
|
|||||||
origin: "api",
|
origin: "api",
|
||||||
crawl_id: id,
|
crawl_id: id,
|
||||||
webhook: req.body.webhook,
|
webhook: req.body.webhook,
|
||||||
v1: true,
|
v1: true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
priority: 15,
|
priority: 15
|
||||||
},
|
},
|
||||||
jobId,
|
jobId
|
||||||
);
|
);
|
||||||
logger.debug("Adding scrape job to BullMQ...", { jobId });
|
logger.debug("Adding scrape job to BullMQ...", { jobId });
|
||||||
await addCrawlJob(id, jobId);
|
await addCrawlJob(id, jobId);
|
||||||
}
|
}
|
||||||
logger.debug("Done queueing jobs!");
|
logger.debug("Done queueing jobs!");
|
||||||
|
|
||||||
if(req.body.webhook) {
|
if (req.body.webhook) {
|
||||||
logger.debug("Calling webhook with crawl.started...", { webhook: req.body.webhook });
|
logger.debug("Calling webhook with crawl.started...", {
|
||||||
await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "crawl.started");
|
webhook: req.body.webhook
|
||||||
|
});
|
||||||
|
await callWebhook(
|
||||||
|
req.auth.team_id,
|
||||||
|
id,
|
||||||
|
null,
|
||||||
|
req.body.webhook,
|
||||||
|
true,
|
||||||
|
"crawl.started"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||||
@@ -190,8 +223,6 @@ export async function crawlController(
|
|||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
id,
|
id,
|
||||||
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
|
url: `${protocol}://${req.get("host")}/v1/crawl/${id}`
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import {
|
|||||||
extractRequestSchema,
|
extractRequestSchema,
|
||||||
ExtractResponse,
|
ExtractResponse,
|
||||||
MapDocument,
|
MapDocument,
|
||||||
scrapeOptions,
|
scrapeOptions
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { Document } from "../../lib/entities";
|
import { Document } from "../../lib/entities";
|
||||||
import Redis from "ioredis";
|
import Redis from "ioredis";
|
||||||
@@ -56,16 +56,18 @@ export async function extractController(
|
|||||||
|
|
||||||
// Process all URLs in parallel
|
// Process all URLs in parallel
|
||||||
const urlPromises = req.body.urls.map(async (url) => {
|
const urlPromises = req.body.urls.map(async (url) => {
|
||||||
if (url.includes('/*') || req.body.allowExternalLinks) {
|
if (url.includes("/*") || req.body.allowExternalLinks) {
|
||||||
// Handle glob pattern URLs
|
// Handle glob pattern URLs
|
||||||
const baseUrl = url.replace('/*', '');
|
const baseUrl = url.replace("/*", "");
|
||||||
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
|
||||||
|
|
||||||
const allowExternalLinks = req.body.allowExternalLinks ?? true;
|
const allowExternalLinks = req.body.allowExternalLinks ?? true;
|
||||||
let urlWithoutWww = baseUrl.replace("www.", "");
|
let urlWithoutWww = baseUrl.replace("www.", "");
|
||||||
let mapUrl = req.body.prompt && allowExternalLinks
|
let mapUrl =
|
||||||
|
req.body.prompt && allowExternalLinks
|
||||||
? `${req.body.prompt} ${urlWithoutWww}`
|
? `${req.body.prompt} ${urlWithoutWww}`
|
||||||
: req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
|
: req.body.prompt
|
||||||
|
? `${req.body.prompt} site:${urlWithoutWww}`
|
||||||
: `site:${urlWithoutWww}`;
|
: `site:${urlWithoutWww}`;
|
||||||
|
|
||||||
const mapResults = await getMapResults({
|
const mapResults = await getMapResults({
|
||||||
@@ -79,14 +81,16 @@ export async function extractController(
|
|||||||
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
|
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
|
||||||
ignoreSitemap: !selfHosted ? true : false,
|
ignoreSitemap: !selfHosted ? true : false,
|
||||||
includeMetadata: true,
|
includeMetadata: true,
|
||||||
includeSubdomains: req.body.includeSubdomains,
|
includeSubdomains: req.body.includeSubdomains
|
||||||
});
|
});
|
||||||
|
|
||||||
let mappedLinks = mapResults.links as MapDocument[];
|
let mappedLinks = mapResults.links as MapDocument[];
|
||||||
// Limit number of links to MAX_EXTRACT_LIMIT
|
// Limit number of links to MAX_EXTRACT_LIMIT
|
||||||
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
||||||
|
|
||||||
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
|
let mappedLinksRerank = mappedLinks.map(
|
||||||
|
(x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`
|
||||||
|
);
|
||||||
|
|
||||||
// Filter by path prefix if present
|
// Filter by path prefix if present
|
||||||
// wrong
|
// wrong
|
||||||
@@ -96,32 +100,50 @@ export async function extractController(
|
|||||||
|
|
||||||
if (req.body.prompt) {
|
if (req.body.prompt) {
|
||||||
// Get similarity scores between the search query and each link's context
|
// Get similarity scores between the search query and each link's context
|
||||||
const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
|
const linksAndScores = await performRanking(
|
||||||
|
mappedLinksRerank,
|
||||||
|
mappedLinks.map((l) => l.url),
|
||||||
|
mapUrl
|
||||||
|
);
|
||||||
|
|
||||||
// First try with high threshold
|
// First try with high threshold
|
||||||
let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);
|
let filteredLinks = filterAndProcessLinks(
|
||||||
|
mappedLinks,
|
||||||
|
linksAndScores,
|
||||||
|
INITIAL_SCORE_THRESHOLD
|
||||||
|
);
|
||||||
|
|
||||||
// If we don't have enough high-quality links, try with lower threshold
|
// If we don't have enough high-quality links, try with lower threshold
|
||||||
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
|
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
|
||||||
logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
|
logger.info(
|
||||||
filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);
|
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`
|
||||||
|
);
|
||||||
|
filteredLinks = filterAndProcessLinks(
|
||||||
|
mappedLinks,
|
||||||
|
linksAndScores,
|
||||||
|
FALLBACK_SCORE_THRESHOLD
|
||||||
|
);
|
||||||
|
|
||||||
if (filteredLinks.length === 0) {
|
if (filteredLinks.length === 0) {
|
||||||
// If still no results, take top N results regardless of score
|
// If still no results, take top N results regardless of score
|
||||||
logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
|
logger.warn(
|
||||||
|
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`
|
||||||
|
);
|
||||||
filteredLinks = linksAndScores
|
filteredLinks = linksAndScores
|
||||||
.sort((a, b) => b.score - a.score)
|
.sort((a, b) => b.score - a.score)
|
||||||
.slice(0, MIN_REQUIRED_LINKS)
|
.slice(0, MIN_REQUIRED_LINKS)
|
||||||
.map(x => mappedLinks.find(link => link.url === x.link))
|
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||||
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
|
.filter(
|
||||||
|
(x): x is MapDocument =>
|
||||||
|
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
|
mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mappedLinks.map(x => x.url) as string[];
|
return mappedLinks.map((x) => x.url) as string[];
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// Handle direct URLs without glob pattern
|
// Handle direct URLs without glob pattern
|
||||||
if (!isUrlBlocked(url)) {
|
if (!isUrlBlocked(url)) {
|
||||||
@@ -138,7 +160,8 @@ export async function extractController(
|
|||||||
if (links.length === 0) {
|
if (links.length === 0) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
|
error:
|
||||||
|
"No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,7 +174,7 @@ export async function extractController(
|
|||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
plan: req.auth.plan as PlanType,
|
plan: req.auth.plan as PlanType,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
basePriority: 10,
|
basePriority: 10
|
||||||
});
|
});
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
@@ -163,7 +186,7 @@ export async function extractController(
|
|||||||
internalOptions: {},
|
internalOptions: {},
|
||||||
plan: req.auth.plan!,
|
plan: req.auth.plan!,
|
||||||
origin,
|
origin,
|
||||||
is_scrape: true,
|
is_scrape: true
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId,
|
jobId,
|
||||||
@@ -179,7 +202,10 @@ export async function extractController(
|
|||||||
return doc;
|
return doc;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error(`Error in scrapeController: ${e}`);
|
logger.error(`Error in scrapeController: ${e}`);
|
||||||
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
|
if (
|
||||||
|
e instanceof Error &&
|
||||||
|
(e.message.startsWith("Job wait") || e.message === "timeout")
|
||||||
|
) {
|
||||||
throw {
|
throw {
|
||||||
status: 408,
|
status: 408,
|
||||||
error: "Request timed out"
|
error: "Request timed out"
|
||||||
@@ -187,7 +213,7 @@ export async function extractController(
|
|||||||
} else {
|
} else {
|
||||||
throw {
|
throw {
|
||||||
status: 500,
|
status: 500,
|
||||||
error: `(Internal server error) - ${(e && e.message) ? e.message : e}`
|
error: `(Internal server error) - ${e && e.message ? e.message : e}`
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -195,7 +221,7 @@ export async function extractController(
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const results = await Promise.all(scrapePromises);
|
const results = await Promise.all(scrapePromises);
|
||||||
docs.push(...results.filter(doc => doc !== null).map(x => x!));
|
docs.push(...results.filter((doc) => doc !== null).map((x) => x!));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
return res.status(e.status).json({
|
return res.status(e.status).json({
|
||||||
success: false,
|
success: false,
|
||||||
@@ -207,20 +233,26 @@ export async function extractController(
|
|||||||
logger.child({ method: "extractController/generateOpenAICompletions" }),
|
logger.child({ method: "extractController/generateOpenAICompletions" }),
|
||||||
{
|
{
|
||||||
mode: "llm",
|
mode: "llm",
|
||||||
systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " + links.join(", "),
|
systemPrompt:
|
||||||
|
"Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided. Here are the urls the user provided of which he wants to extract information from: " +
|
||||||
|
links.join(", "),
|
||||||
prompt: req.body.prompt,
|
prompt: req.body.prompt,
|
||||||
schema: req.body.schema,
|
schema: req.body.schema
|
||||||
},
|
},
|
||||||
docs.map(x => buildDocument(x)).join('\n'),
|
docs.map((x) => buildDocument(x)).join("\n"),
|
||||||
undefined,
|
undefined,
|
||||||
true // isExtractEndpoint
|
true // isExtractEndpoint
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO: change this later
|
// TODO: change this later
|
||||||
// While on beta, we're billing 5 credits per link discovered/scraped.
|
// While on beta, we're billing 5 credits per link discovered/scraped.
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
|
||||||
logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`);
|
(error) => {
|
||||||
});
|
logger.error(
|
||||||
|
`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
let data = completions.extract ?? {};
|
let data = completions.extract ?? {};
|
||||||
let warning = completions.warning;
|
let warning = completions.warning;
|
||||||
@@ -257,11 +289,19 @@ export async function extractController(
|
|||||||
*/
|
*/
|
||||||
function filterAndProcessLinks(
|
function filterAndProcessLinks(
|
||||||
mappedLinks: MapDocument[],
|
mappedLinks: MapDocument[],
|
||||||
linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
|
linksAndScores: {
|
||||||
|
link: string;
|
||||||
|
linkWithContext: string;
|
||||||
|
score: number;
|
||||||
|
originalIndex: number;
|
||||||
|
}[],
|
||||||
threshold: number
|
threshold: number
|
||||||
): MapDocument[] {
|
): MapDocument[] {
|
||||||
return linksAndScores
|
return linksAndScores
|
||||||
.filter(x => x.score > threshold)
|
.filter((x) => x.score > threshold)
|
||||||
.map(x => mappedLinks.find(link => link.url === x.link))
|
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||||
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
|
.filter(
|
||||||
|
(x): x is MapDocument =>
|
||||||
|
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
@@ -1,6 +1,11 @@
|
|||||||
import { Response } from "express";
|
import { Response } from "express";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
|
import {
|
||||||
|
MapDocument,
|
||||||
|
mapRequestSchema,
|
||||||
|
RequestWithAuth,
|
||||||
|
scrapeOptions
|
||||||
|
} from "./types";
|
||||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||||
import { MapResponse, MapRequest } from "./types";
|
import { MapResponse, MapRequest } from "./types";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
@@ -8,7 +13,7 @@ import {
|
|||||||
checkAndUpdateURLForMap,
|
checkAndUpdateURLForMap,
|
||||||
isSameDomain,
|
isSameDomain,
|
||||||
isSameSubdomain,
|
isSameSubdomain,
|
||||||
removeDuplicateUrls,
|
removeDuplicateUrls
|
||||||
} from "../../lib/validateUrl";
|
} from "../../lib/validateUrl";
|
||||||
import { fireEngineMap } from "../../search/fireEngine";
|
import { fireEngineMap } from "../../search/fireEngine";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
@@ -67,13 +72,13 @@ export async function getMapResults({
|
|||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
...crawlerOptions,
|
...crawlerOptions,
|
||||||
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
|
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
|
||||||
scrapeOptions: undefined,
|
scrapeOptions: undefined
|
||||||
},
|
},
|
||||||
scrapeOptions: scrapeOptions.parse({}),
|
scrapeOptions: scrapeOptions.parse({}),
|
||||||
internalOptions: {},
|
internalOptions: {},
|
||||||
team_id: teamId,
|
team_id: teamId,
|
||||||
createdAt: Date.now(),
|
createdAt: Date.now(),
|
||||||
plan: plan,
|
plan: plan
|
||||||
};
|
};
|
||||||
|
|
||||||
const crawler = crawlToCrawler(id, sc);
|
const crawler = crawlToCrawler(id, sc);
|
||||||
@@ -85,7 +90,8 @@ export async function getMapResults({
|
|||||||
sitemap.forEach((x) => {
|
sitemap.forEach((x) => {
|
||||||
links.push(x.url);
|
links.push(x.url);
|
||||||
});
|
});
|
||||||
links = links.slice(1)
|
links = links
|
||||||
|
.slice(1)
|
||||||
.map((x) => {
|
.map((x) => {
|
||||||
try {
|
try {
|
||||||
return checkAndUpdateURLForMap(x).url.trim();
|
return checkAndUpdateURLForMap(x).url.trim();
|
||||||
@@ -99,13 +105,17 @@ export async function getMapResults({
|
|||||||
} else {
|
} else {
|
||||||
let urlWithoutWww = url.replace("www.", "");
|
let urlWithoutWww = url.replace("www.", "");
|
||||||
|
|
||||||
let mapUrl = search && allowExternalLinks
|
let mapUrl =
|
||||||
|
search && allowExternalLinks
|
||||||
? `${search} ${urlWithoutWww}`
|
? `${search} ${urlWithoutWww}`
|
||||||
: search ? `${search} site:${urlWithoutWww}`
|
: search
|
||||||
|
? `${search} site:${urlWithoutWww}`
|
||||||
: `site:${url}`;
|
: `site:${url}`;
|
||||||
|
|
||||||
const resultsPerPage = 100;
|
const resultsPerPage = 100;
|
||||||
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
|
const maxPages = Math.ceil(
|
||||||
|
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
|
||||||
|
);
|
||||||
|
|
||||||
const cacheKey = `fireEngineMap:${mapUrl}`;
|
const cacheKey = `fireEngineMap:${mapUrl}`;
|
||||||
const cachedResult = await redis.get(cacheKey);
|
const cachedResult = await redis.get(cacheKey);
|
||||||
@@ -119,7 +129,7 @@ export async function getMapResults({
|
|||||||
const fetchPage = async (page: number) => {
|
const fetchPage = async (page: number) => {
|
||||||
return fireEngineMap(mapUrl, {
|
return fireEngineMap(mapUrl, {
|
||||||
numResults: resultsPerPage,
|
numResults: resultsPerPage,
|
||||||
page: page,
|
page: page
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -134,7 +144,7 @@ export async function getMapResults({
|
|||||||
// Parallelize sitemap fetch with serper search
|
// Parallelize sitemap fetch with serper search
|
||||||
const [sitemap, ...searchResults] = await Promise.all([
|
const [sitemap, ...searchResults] = await Promise.all([
|
||||||
ignoreSitemap ? null : crawler.tryGetSitemap(true),
|
ignoreSitemap ? null : crawler.tryGetSitemap(true),
|
||||||
...(cachedResult ? [] : pagePromises),
|
...(cachedResult ? [] : pagePromises)
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (!cachedResult) {
|
if (!cachedResult) {
|
||||||
@@ -162,7 +172,7 @@ export async function getMapResults({
|
|||||||
links = [
|
links = [
|
||||||
mapResults[0].url,
|
mapResults[0].url,
|
||||||
...mapResults.slice(1).map((x) => x.url),
|
...mapResults.slice(1).map((x) => x.url),
|
||||||
...links,
|
...links
|
||||||
];
|
];
|
||||||
} else {
|
} else {
|
||||||
mapResults.map((x) => {
|
mapResults.map((x) => {
|
||||||
@@ -199,14 +209,16 @@ export async function getMapResults({
|
|||||||
links = removeDuplicateUrls(links);
|
links = removeDuplicateUrls(links);
|
||||||
}
|
}
|
||||||
|
|
||||||
const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit);
|
const linksToReturn = crawlerOptions.sitemapOnly
|
||||||
|
? links
|
||||||
|
: links.slice(0, limit);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
links: includeMetadata ? mapResults : linksToReturn,
|
links: includeMetadata ? mapResults : linksToReturn,
|
||||||
scrape_id: origin?.includes("website") ? id : undefined,
|
scrape_id: origin?.includes("website") ? id : undefined,
|
||||||
job_id: id,
|
job_id: id,
|
||||||
time_taken: (new Date().getTime() - Date.now()) / 1000,
|
time_taken: (new Date().getTime() - Date.now()) / 1000
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -225,7 +237,7 @@ export async function mapController(
|
|||||||
crawlerOptions: req.body,
|
crawlerOptions: req.body,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
teamId: req.auth.team_id,
|
teamId: req.auth.team_id,
|
||||||
plan: req.auth.plan,
|
plan: req.auth.plan
|
||||||
});
|
});
|
||||||
|
|
||||||
// Bill the team
|
// Bill the team
|
||||||
@@ -249,7 +261,7 @@ export async function mapController(
|
|||||||
crawlerOptions: {},
|
crawlerOptions: {},
|
||||||
scrapeOptions: {},
|
scrapeOptions: {},
|
||||||
origin: req.body.origin ?? "api",
|
origin: req.body.origin ?? "api",
|
||||||
num_tokens: 0,
|
num_tokens: 0
|
||||||
});
|
});
|
||||||
|
|
||||||
const response = {
|
const response = {
|
||||||
|
|||||||
@@ -16,26 +16,26 @@ export async function scrapeStatusController(req: any, res: any) {
|
|||||||
"511544f2-2fce-4183-9c59-6c29b02c69b5"
|
"511544f2-2fce-4183-9c59-6c29b02c69b5"
|
||||||
];
|
];
|
||||||
|
|
||||||
if(!allowedTeams.includes(job?.team_id)){
|
if (!allowedTeams.includes(job?.team_id)) {
|
||||||
return res.status(403).json({
|
return res.status(403).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "You are not allowed to access this resource.",
|
error: "You are not allowed to access this resource."
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: job?.docs[0],
|
data: job?.docs[0]
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && error.message == "Too Many Requests") {
|
if (error instanceof Error && error.message == "Too Many Requests") {
|
||||||
return res.status(429).json({
|
return res.status(429).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "Rate limit exceeded. Please try again later.",
|
error: "Rate limit exceeded. Please try again later."
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "An unexpected error occurred.",
|
error: "An unexpected error occurred."
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import {
|
|||||||
RequestWithAuth,
|
RequestWithAuth,
|
||||||
ScrapeRequest,
|
ScrapeRequest,
|
||||||
scrapeRequestSchema,
|
scrapeRequestSchema,
|
||||||
ScrapeResponse,
|
ScrapeResponse
|
||||||
} from "./types";
|
} from "./types";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
@@ -30,7 +30,7 @@ export async function scrapeController(
|
|||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
plan: req.auth.plan as PlanType,
|
plan: req.auth.plan as PlanType,
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
basePriority: 10,
|
basePriority: 10
|
||||||
});
|
});
|
||||||
|
|
||||||
await addScrapeJob(
|
await addScrapeJob(
|
||||||
@@ -42,29 +42,37 @@ export async function scrapeController(
|
|||||||
internalOptions: {},
|
internalOptions: {},
|
||||||
plan: req.auth.plan!,
|
plan: req.auth.plan!,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
is_scrape: true,
|
is_scrape: true
|
||||||
},
|
},
|
||||||
{},
|
{},
|
||||||
jobId,
|
jobId,
|
||||||
jobPriority
|
jobPriority
|
||||||
);
|
);
|
||||||
|
|
||||||
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
|
const totalWait =
|
||||||
|
(req.body.waitFor ?? 0) +
|
||||||
|
(req.body.actions ?? []).reduce(
|
||||||
|
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 0) : 0) + a,
|
||||||
|
0
|
||||||
|
);
|
||||||
|
|
||||||
let doc: Document;
|
let doc: Document;
|
||||||
try {
|
try {
|
||||||
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
|
doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error(`Error in scrapeController: ${e}`);
|
logger.error(`Error in scrapeController: ${e}`);
|
||||||
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
|
if (
|
||||||
|
e instanceof Error &&
|
||||||
|
(e.message.startsWith("Job wait") || e.message === "timeout")
|
||||||
|
) {
|
||||||
return res.status(408).json({
|
return res.status(408).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "Request timed out",
|
error: "Request timed out"
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: `(Internal server error) - ${(e && e.message) ? e.message : e}`,
|
error: `(Internal server error) - ${e && e.message ? e.message : e}`
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -75,8 +83,8 @@ export async function scrapeController(
|
|||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
const numTokens =
|
const numTokens =
|
||||||
doc && doc.extract
|
doc && doc.extract
|
||||||
// ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
? // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||||
? 0 // TODO: fix
|
0 // TODO: fix
|
||||||
: 0;
|
: 0;
|
||||||
|
|
||||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||||
@@ -84,14 +92,18 @@ export async function scrapeController(
|
|||||||
// Don't bill if we're early returning
|
// Don't bill if we're early returning
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if(req.body.extract && req.body.formats.includes("extract")) {
|
if (req.body.extract && req.body.formats.includes("extract")) {
|
||||||
creditsToBeBilled = 5;
|
creditsToBeBilled = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
|
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(
|
||||||
logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
|
(error) => {
|
||||||
|
logger.error(
|
||||||
|
`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`
|
||||||
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
if (!req.body.formats.includes("rawHtml")) {
|
if (!req.body.formats.includes("rawHtml")) {
|
||||||
if (doc && doc.rawHtml) {
|
if (doc && doc.rawHtml) {
|
||||||
@@ -111,12 +123,12 @@ export async function scrapeController(
|
|||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
scrapeOptions: req.body,
|
scrapeOptions: req.body,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
num_tokens: numTokens,
|
num_tokens: numTokens
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: doc,
|
data: doc,
|
||||||
scrape_id: origin?.includes("website") ? jobId : undefined,
|
scrape_id: origin?.includes("website") ? jobId : undefined
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,12 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
|||||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
import { countries } from "../../lib/validate-country";
|
import { countries } from "../../lib/validate-country";
|
||||||
import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
|
import {
|
||||||
|
ExtractorOptions,
|
||||||
|
PageOptions,
|
||||||
|
ScrapeActionContent,
|
||||||
|
Document as V0Document
|
||||||
|
} from "../../lib/entities";
|
||||||
import { InternalOptions } from "../../scraper/scrapeURL";
|
import { InternalOptions } from "../../scraper/scrapeURL";
|
||||||
|
|
||||||
export type Format =
|
export type Format =
|
||||||
@@ -31,76 +36,88 @@ export const url = z.preprocess(
|
|||||||
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
|
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
|
||||||
"URL must have a valid top-level domain or be a valid path"
|
"URL must have a valid top-level domain or be a valid path"
|
||||||
)
|
)
|
||||||
.refine(
|
.refine((x) => {
|
||||||
(x) => {
|
|
||||||
try {
|
try {
|
||||||
checkUrl(x as string)
|
checkUrl(x as string);
|
||||||
return true;
|
return true;
|
||||||
} catch (_) {
|
} catch (_) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
},
|
}, "Invalid URL")
|
||||||
"Invalid URL"
|
|
||||||
)
|
|
||||||
.refine(
|
.refine(
|
||||||
(x) => !isUrlBlocked(x as string),
|
(x) => !isUrlBlocked(x as string),
|
||||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
|
const strictMessage =
|
||||||
|
"Unrecognized key in body -- please review the v1 API documentation for request body changes";
|
||||||
|
|
||||||
export const extractOptions = z.object({
|
export const extractOptions = z
|
||||||
|
.object({
|
||||||
mode: z.enum(["llm"]).default("llm"),
|
mode: z.enum(["llm"]).default("llm"),
|
||||||
schema: z.any().optional(),
|
schema: z.any().optional(),
|
||||||
systemPrompt: z.string().default("Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."),
|
systemPrompt: z
|
||||||
|
.string()
|
||||||
|
.default(
|
||||||
|
"Based on the information on the page, extract all the information from the schema in JSON format. Try to extract all the fields even those that might not be marked as required."
|
||||||
|
),
|
||||||
prompt: z.string().optional()
|
prompt: z.string().optional()
|
||||||
}).strict(strictMessage);
|
})
|
||||||
|
.strict(strictMessage);
|
||||||
|
|
||||||
export type ExtractOptions = z.infer<typeof extractOptions>;
|
export type ExtractOptions = z.infer<typeof extractOptions>;
|
||||||
|
|
||||||
export const actionsSchema = z.array(z.union([
|
export const actionsSchema = z.array(
|
||||||
z.object({
|
z.union([
|
||||||
|
z
|
||||||
|
.object({
|
||||||
type: z.literal("wait"),
|
type: z.literal("wait"),
|
||||||
milliseconds: z.number().int().positive().finite().optional(),
|
milliseconds: z.number().int().positive().finite().optional(),
|
||||||
selector: z.string().optional(),
|
selector: z.string().optional()
|
||||||
}).refine(
|
})
|
||||||
(data) => (data.milliseconds !== undefined || data.selector !== undefined) && !(data.milliseconds !== undefined && data.selector !== undefined),
|
.refine(
|
||||||
|
(data) =>
|
||||||
|
(data.milliseconds !== undefined || data.selector !== undefined) &&
|
||||||
|
!(data.milliseconds !== undefined && data.selector !== undefined),
|
||||||
{
|
{
|
||||||
message: "Either 'milliseconds' or 'selector' must be provided, but not both.",
|
message:
|
||||||
|
"Either 'milliseconds' or 'selector' must be provided, but not both."
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("click"),
|
type: z.literal("click"),
|
||||||
selector: z.string(),
|
selector: z.string()
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("screenshot"),
|
type: z.literal("screenshot"),
|
||||||
fullPage: z.boolean().default(false),
|
fullPage: z.boolean().default(false)
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("write"),
|
type: z.literal("write"),
|
||||||
text: z.string(),
|
text: z.string()
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("press"),
|
type: z.literal("press"),
|
||||||
key: z.string(),
|
key: z.string()
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("scroll"),
|
type: z.literal("scroll"),
|
||||||
direction: z.enum(["up", "down"]).optional().default("down"),
|
direction: z.enum(["up", "down"]).optional().default("down"),
|
||||||
selector: z.string().optional(),
|
selector: z.string().optional()
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("scrape"),
|
type: z.literal("scrape")
|
||||||
}),
|
}),
|
||||||
z.object({
|
z.object({
|
||||||
type: z.literal("executeJavascript"),
|
type: z.literal("executeJavascript"),
|
||||||
script: z.string()
|
script: z.string()
|
||||||
}),
|
})
|
||||||
]));
|
])
|
||||||
|
);
|
||||||
|
|
||||||
export const scrapeOptions = z.object({
|
export const scrapeOptions = z
|
||||||
|
.object({
|
||||||
formats: z
|
formats: z
|
||||||
.enum([
|
.enum([
|
||||||
"markdown",
|
"markdown",
|
||||||
@@ -114,7 +131,10 @@ export const scrapeOptions = z.object({
|
|||||||
.array()
|
.array()
|
||||||
.optional()
|
.optional()
|
||||||
.default(["markdown"])
|
.default(["markdown"])
|
||||||
.refine(x => !(x.includes("screenshot") && x.includes("screenshot@fullPage")), "You may only specify either screenshot or screenshot@fullPage"),
|
.refine(
|
||||||
|
(x) => !(x.includes("screenshot") && x.includes("screenshot@fullPage")),
|
||||||
|
"You may only specify either screenshot or screenshot@fullPage"
|
||||||
|
),
|
||||||
headers: z.record(z.string(), z.string()).optional(),
|
headers: z.record(z.string(), z.string()).optional(),
|
||||||
includeTags: z.string().array().optional(),
|
includeTags: z.string().array().optional(),
|
||||||
excludeTags: z.string().array().optional(),
|
excludeTags: z.string().array().optional(),
|
||||||
@@ -126,36 +146,52 @@ export const scrapeOptions = z.object({
|
|||||||
parsePDF: z.boolean().default(true),
|
parsePDF: z.boolean().default(true),
|
||||||
actions: actionsSchema.optional(),
|
actions: actionsSchema.optional(),
|
||||||
// New
|
// New
|
||||||
location: z.object({
|
location: z
|
||||||
country: z.string().optional().refine(
|
.object({
|
||||||
|
country: z
|
||||||
|
.string()
|
||||||
|
.optional()
|
||||||
|
.refine(
|
||||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
||||||
{
|
{
|
||||||
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
message:
|
||||||
|
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code."
|
||||||
}
|
}
|
||||||
).transform(val => val ? val.toUpperCase() : 'US'),
|
)
|
||||||
languages: z.string().array().optional(),
|
.transform((val) => (val ? val.toUpperCase() : "US")),
|
||||||
}).optional(),
|
languages: z.string().array().optional()
|
||||||
|
})
|
||||||
|
.optional(),
|
||||||
|
|
||||||
// Deprecated
|
// Deprecated
|
||||||
geolocation: z.object({
|
geolocation: z
|
||||||
country: z.string().optional().refine(
|
.object({
|
||||||
|
country: z
|
||||||
|
.string()
|
||||||
|
.optional()
|
||||||
|
.refine(
|
||||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
||||||
{
|
{
|
||||||
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
message:
|
||||||
|
"Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code."
|
||||||
}
|
}
|
||||||
).transform(val => val ? val.toUpperCase() : 'US'),
|
)
|
||||||
languages: z.string().array().optional(),
|
.transform((val) => (val ? val.toUpperCase() : "US")),
|
||||||
}).optional(),
|
languages: z.string().array().optional()
|
||||||
|
})
|
||||||
|
.optional(),
|
||||||
skipTlsVerification: z.boolean().default(false),
|
skipTlsVerification: z.boolean().default(false),
|
||||||
removeBase64Images: z.boolean().default(true),
|
removeBase64Images: z.boolean().default(true)
|
||||||
}).strict(strictMessage)
|
})
|
||||||
|
.strict(strictMessage);
|
||||||
|
|
||||||
|
|
||||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||||
|
|
||||||
export const extractV1Options = z.object({
|
export const extractV1Options = z
|
||||||
urls: url.array().max(10, "Maximum of 10 URLs allowed per request while in beta."),
|
.object({
|
||||||
|
urls: url
|
||||||
|
.array()
|
||||||
|
.max(10, "Maximum of 10 URLs allowed per request while in beta."),
|
||||||
prompt: z.string().optional(),
|
prompt: z.string().optional(),
|
||||||
schema: z.any().optional(),
|
schema: z.any().optional(),
|
||||||
limit: z.number().int().positive().finite().safe().optional(),
|
limit: z.number().int().positive().finite().safe().optional(),
|
||||||
@@ -164,67 +200,88 @@ export const extractV1Options = z.object({
|
|||||||
allowExternalLinks: z.boolean().default(false),
|
allowExternalLinks: z.boolean().default(false),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(60000)
|
timeout: z.number().int().positive().finite().safe().default(60000)
|
||||||
}).strict(strictMessage)
|
})
|
||||||
|
.strict(strictMessage);
|
||||||
|
|
||||||
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
export type ExtractV1Options = z.infer<typeof extractV1Options>;
|
||||||
export const extractRequestSchema = extractV1Options;
|
export const extractRequestSchema = extractV1Options;
|
||||||
export type ExtractRequest = z.infer<typeof extractRequestSchema>;
|
export type ExtractRequest = z.infer<typeof extractRequestSchema>;
|
||||||
|
|
||||||
export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
|
export const scrapeRequestSchema = scrapeOptions
|
||||||
|
.omit({ timeout: true })
|
||||||
|
.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
timeout: z.number().int().positive().finite().safe().default(30000),
|
timeout: z.number().int().positive().finite().safe().default(30000)
|
||||||
}).strict(strictMessage).refine(
|
})
|
||||||
|
.strict(strictMessage)
|
||||||
|
.refine(
|
||||||
(obj) => {
|
(obj) => {
|
||||||
const hasExtractFormat = obj.formats?.includes("extract");
|
const hasExtractFormat = obj.formats?.includes("extract");
|
||||||
const hasExtractOptions = obj.extract !== undefined;
|
const hasExtractOptions = obj.extract !== undefined;
|
||||||
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
|
return (
|
||||||
|
(hasExtractFormat && hasExtractOptions) ||
|
||||||
|
(!hasExtractFormat && !hasExtractOptions)
|
||||||
|
);
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
message:
|
||||||
|
"When 'extract' format is specified, 'extract' options must be provided, and vice versa"
|
||||||
}
|
}
|
||||||
).transform((obj) => {
|
)
|
||||||
|
.transform((obj) => {
|
||||||
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
||||||
return { ...obj, timeout: 60000 };
|
return { ...obj, timeout: 60000 };
|
||||||
}
|
}
|
||||||
return obj;
|
return obj;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||||
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
|
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
|
||||||
|
|
||||||
export const webhookSchema = z.preprocess(x => {
|
export const webhookSchema = z.preprocess(
|
||||||
|
(x) => {
|
||||||
if (typeof x === "string") {
|
if (typeof x === "string") {
|
||||||
return { url: x };
|
return { url: x };
|
||||||
} else {
|
} else {
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
}, z.object({
|
},
|
||||||
|
z
|
||||||
|
.object({
|
||||||
url: z.string().url(),
|
url: z.string().url(),
|
||||||
headers: z.record(z.string(), z.string()).default({}),
|
headers: z.record(z.string(), z.string()).default({})
|
||||||
}).strict(strictMessage))
|
})
|
||||||
|
.strict(strictMessage)
|
||||||
|
);
|
||||||
|
|
||||||
export const batchScrapeRequestSchema = scrapeOptions.extend({
|
export const batchScrapeRequestSchema = scrapeOptions
|
||||||
|
.extend({
|
||||||
urls: url.array(),
|
urls: url.array(),
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
webhook: webhookSchema.optional(),
|
webhook: webhookSchema.optional(),
|
||||||
appendToId: z.string().uuid().optional(),
|
appendToId: z.string().uuid().optional()
|
||||||
}).strict(strictMessage).refine(
|
})
|
||||||
|
.strict(strictMessage)
|
||||||
|
.refine(
|
||||||
(obj) => {
|
(obj) => {
|
||||||
const hasExtractFormat = obj.formats?.includes("extract");
|
const hasExtractFormat = obj.formats?.includes("extract");
|
||||||
const hasExtractOptions = obj.extract !== undefined;
|
const hasExtractOptions = obj.extract !== undefined;
|
||||||
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
|
return (
|
||||||
|
(hasExtractFormat && hasExtractOptions) ||
|
||||||
|
(!hasExtractFormat && !hasExtractOptions)
|
||||||
|
);
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
message:
|
||||||
|
"When 'extract' format is specified, 'extract' options must be provided, and vice versa"
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
||||||
|
|
||||||
const crawlerOptions = z.object({
|
const crawlerOptions = z
|
||||||
|
.object({
|
||||||
includePaths: z.string().array().default([]),
|
includePaths: z.string().array().default([]),
|
||||||
excludePaths: z.string().array().default([]),
|
excludePaths: z.string().array().default([]),
|
||||||
maxDepth: z.number().default(10), // default?
|
maxDepth: z.number().default(10), // default?
|
||||||
@@ -235,8 +292,9 @@ const crawlerOptions = z.object({
|
|||||||
ignoreRobotsTxt: z.boolean().default(false),
|
ignoreRobotsTxt: z.boolean().default(false),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
deduplicateSimilarURLs: z.boolean().default(true),
|
deduplicateSimilarURLs: z.boolean().default(true),
|
||||||
ignoreQueryParameters: z.boolean().default(false),
|
ignoreQueryParameters: z.boolean().default(false)
|
||||||
}).strict(strictMessage);
|
})
|
||||||
|
.strict(strictMessage);
|
||||||
|
|
||||||
// export type CrawlerOptions = {
|
// export type CrawlerOptions = {
|
||||||
// includePaths?: string[];
|
// includePaths?: string[];
|
||||||
@@ -250,13 +308,15 @@ const crawlerOptions = z.object({
|
|||||||
|
|
||||||
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
||||||
|
|
||||||
export const crawlRequestSchema = crawlerOptions.extend({
|
export const crawlRequestSchema = crawlerOptions
|
||||||
|
.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
scrapeOptions: scrapeOptions.default({}),
|
scrapeOptions: scrapeOptions.default({}),
|
||||||
webhook: webhookSchema.optional(),
|
webhook: webhookSchema.optional(),
|
||||||
limit: z.number().default(10000),
|
limit: z.number().default(10000)
|
||||||
}).strict(strictMessage);
|
})
|
||||||
|
.strict(strictMessage);
|
||||||
|
|
||||||
// export type CrawlRequest = {
|
// export type CrawlRequest = {
|
||||||
// url: string;
|
// url: string;
|
||||||
@@ -270,18 +330,19 @@ export const crawlRequestSchema = crawlerOptions.extend({
|
|||||||
// extractionSchema?: Record<string, any>;
|
// extractionSchema?: Record<string, any>;
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
|
||||||
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
||||||
|
|
||||||
export const mapRequestSchema = crawlerOptions.extend({
|
export const mapRequestSchema = crawlerOptions
|
||||||
|
.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
includeSubdomains: z.boolean().default(true),
|
includeSubdomains: z.boolean().default(true),
|
||||||
search: z.string().optional(),
|
search: z.string().optional(),
|
||||||
ignoreSitemap: z.boolean().default(false),
|
ignoreSitemap: z.boolean().default(false),
|
||||||
sitemapOnly: z.boolean().default(false),
|
sitemapOnly: z.boolean().default(false),
|
||||||
limit: z.number().min(1).max(5000).default(5000),
|
limit: z.number().min(1).max(5000).default(5000)
|
||||||
}).strict(strictMessage);
|
})
|
||||||
|
.strict(strictMessage);
|
||||||
|
|
||||||
// export type MapRequest = {
|
// export type MapRequest = {
|
||||||
// url: string;
|
// url: string;
|
||||||
@@ -451,7 +512,7 @@ export interface RequestWithMaybeACUC<
|
|||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined
|
||||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
acuc?: AuthCreditUsageChunk,
|
acuc?: AuthCreditUsageChunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RequestWithACUC<
|
export interface RequestWithACUC<
|
||||||
@@ -459,13 +520,13 @@ export interface RequestWithACUC<
|
|||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined
|
ResBody = undefined
|
||||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
acuc: AuthCreditUsageChunk,
|
acuc: AuthCreditUsageChunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RequestWithAuth<
|
export interface RequestWithAuth<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined,
|
ResBody = undefined
|
||||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||||
auth: AuthObject;
|
auth: AuthObject;
|
||||||
account?: Account;
|
account?: Account;
|
||||||
@@ -483,16 +544,15 @@ export interface RequestWithMaybeAuth<
|
|||||||
export interface RequestWithAuth<
|
export interface RequestWithAuth<
|
||||||
ReqParams = {},
|
ReqParams = {},
|
||||||
ReqBody = undefined,
|
ReqBody = undefined,
|
||||||
ResBody = undefined,
|
ResBody = undefined
|
||||||
> extends RequestWithACUC<ReqParams, ReqBody, ResBody> {
|
> extends RequestWithACUC<ReqParams, ReqBody, ResBody> {
|
||||||
auth: AuthObject;
|
auth: AuthObject;
|
||||||
account?: Account;
|
account?: Account;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ResponseWithSentry<
|
export interface ResponseWithSentry<ResBody = undefined>
|
||||||
ResBody = undefined,
|
extends Response<ResBody> {
|
||||||
> extends Response<ResBody> {
|
sentry?: string;
|
||||||
sentry?: string,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
||||||
@@ -509,11 +569,14 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
|
|||||||
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
||||||
ignoreSitemap: x.ignoreSitemap,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
ignoreQueryParameters: x.ignoreQueryParameters
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
|
export function fromLegacyCrawlerOptions(x: any): {
|
||||||
|
crawlOptions: CrawlerOptions;
|
||||||
|
internalOptions: InternalOptions;
|
||||||
|
} {
|
||||||
return {
|
return {
|
||||||
crawlOptions: crawlerOptions.parse({
|
crawlOptions: crawlerOptions.parse({
|
||||||
includePaths: x.includes,
|
includePaths: x.includes,
|
||||||
@@ -526,37 +589,50 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
|
|||||||
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
ignoreRobotsTxt: x.ignoreRobotsTxt,
|
||||||
ignoreSitemap: x.ignoreSitemap,
|
ignoreSitemap: x.ignoreSitemap,
|
||||||
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
deduplicateSimilarURLs: x.deduplicateSimilarURLs,
|
||||||
ignoreQueryParameters: x.ignoreQueryParameters,
|
ignoreQueryParameters: x.ignoreQueryParameters
|
||||||
}),
|
}),
|
||||||
internalOptions: {
|
internalOptions: {
|
||||||
v0CrawlOnlyUrls: x.returnOnlyUrls,
|
v0CrawlOnlyUrls: x.returnOnlyUrls
|
||||||
},
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export interface MapDocument {
|
export interface MapDocument {
|
||||||
url: string;
|
url: string;
|
||||||
title?: string;
|
title?: string;
|
||||||
description?: string;
|
description?: string;
|
||||||
}
|
}
|
||||||
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
|
export function fromLegacyScrapeOptions(
|
||||||
|
pageOptions: PageOptions,
|
||||||
|
extractorOptions: ExtractorOptions | undefined,
|
||||||
|
timeout: number | undefined
|
||||||
|
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
||||||
return {
|
return {
|
||||||
scrapeOptions: scrapeOptions.parse({
|
scrapeOptions: scrapeOptions.parse({
|
||||||
formats: [
|
formats: [
|
||||||
(pageOptions.includeMarkdown ?? true) ? "markdown" as const : null,
|
(pageOptions.includeMarkdown ?? true) ? ("markdown" as const) : null,
|
||||||
(pageOptions.includeHtml ?? false) ? "html" as const : null,
|
(pageOptions.includeHtml ?? false) ? ("html" as const) : null,
|
||||||
(pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null,
|
(pageOptions.includeRawHtml ?? false) ? ("rawHtml" as const) : null,
|
||||||
(pageOptions.screenshot ?? false) ? "screenshot" as const : null,
|
(pageOptions.screenshot ?? false) ? ("screenshot" as const) : null,
|
||||||
(pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null,
|
(pageOptions.fullPageScreenshot ?? false)
|
||||||
(extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? "extract" as const : null,
|
? ("screenshot@fullPage" as const)
|
||||||
|
: null,
|
||||||
|
extractorOptions !== undefined &&
|
||||||
|
extractorOptions.mode.includes("llm-extraction")
|
||||||
|
? ("extract" as const)
|
||||||
|
: null,
|
||||||
"links"
|
"links"
|
||||||
].filter(x => x !== null),
|
].filter((x) => x !== null),
|
||||||
waitFor: pageOptions.waitFor,
|
waitFor: pageOptions.waitFor,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags),
|
includeTags:
|
||||||
excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags),
|
typeof pageOptions.onlyIncludeTags === "string"
|
||||||
|
? [pageOptions.onlyIncludeTags]
|
||||||
|
: pageOptions.onlyIncludeTags,
|
||||||
|
excludeTags:
|
||||||
|
typeof pageOptions.removeTags === "string"
|
||||||
|
? [pageOptions.removeTags]
|
||||||
|
: pageOptions.removeTags,
|
||||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||||
timeout: timeout,
|
timeout: timeout,
|
||||||
parsePDF: pageOptions.parsePDF,
|
parsePDF: pageOptions.parsePDF,
|
||||||
@@ -564,29 +640,45 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio
|
|||||||
location: pageOptions.geolocation,
|
location: pageOptions.geolocation,
|
||||||
skipTlsVerification: pageOptions.skipTlsVerification,
|
skipTlsVerification: pageOptions.skipTlsVerification,
|
||||||
removeBase64Images: pageOptions.removeBase64Images,
|
removeBase64Images: pageOptions.removeBase64Images,
|
||||||
extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? {
|
extract:
|
||||||
|
extractorOptions !== undefined &&
|
||||||
|
extractorOptions.mode.includes("llm-extraction")
|
||||||
|
? {
|
||||||
systemPrompt: extractorOptions.extractionPrompt,
|
systemPrompt: extractorOptions.extractionPrompt,
|
||||||
prompt: extractorOptions.userPrompt,
|
prompt: extractorOptions.userPrompt,
|
||||||
schema: extractorOptions.extractionSchema,
|
schema: extractorOptions.extractionSchema
|
||||||
} : undefined,
|
}
|
||||||
mobile: pageOptions.mobile,
|
: undefined,
|
||||||
|
mobile: pageOptions.mobile
|
||||||
}),
|
}),
|
||||||
internalOptions: {
|
internalOptions: {
|
||||||
atsv: pageOptions.atsv,
|
atsv: pageOptions.atsv,
|
||||||
v0DisableJsDom: pageOptions.disableJsDom,
|
v0DisableJsDom: pageOptions.disableJsDom,
|
||||||
v0UseFastMode: pageOptions.useFastMode,
|
v0UseFastMode: pageOptions.useFastMode
|
||||||
},
|
|
||||||
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
|
||||||
}
|
}
|
||||||
|
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
|
export function fromLegacyCombo(
|
||||||
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
|
pageOptions: PageOptions,
|
||||||
|
extractorOptions: ExtractorOptions | undefined,
|
||||||
|
timeout: number | undefined,
|
||||||
|
crawlerOptions: any
|
||||||
|
): { scrapeOptions: ScrapeOptions; internalOptions: InternalOptions } {
|
||||||
|
const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(
|
||||||
|
pageOptions,
|
||||||
|
extractorOptions,
|
||||||
|
timeout
|
||||||
|
);
|
||||||
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
|
||||||
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
|
||||||
}
|
}
|
||||||
|
|
||||||
export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
|
export function toLegacyDocument(
|
||||||
|
document: Document,
|
||||||
|
internalOptions: InternalOptions
|
||||||
|
): V0Document | { url: string } {
|
||||||
if (internalOptions.v0CrawlOnlyUrls) {
|
if (internalOptions.v0CrawlOnlyUrls) {
|
||||||
return { url: document.metadata.sourceURL! };
|
return { url: document.metadata.sourceURL! };
|
||||||
}
|
}
|
||||||
@@ -604,9 +696,9 @@ export function toLegacyDocument(document: Document, internalOptions: InternalOp
|
|||||||
statusCode: undefined,
|
statusCode: undefined,
|
||||||
pageError: document.metadata.error,
|
pageError: document.metadata.error,
|
||||||
pageStatusCode: document.metadata.statusCode,
|
pageStatusCode: document.metadata.statusCode,
|
||||||
screenshot: document.screenshot,
|
screenshot: document.screenshot
|
||||||
},
|
},
|
||||||
actions: document.actions ,
|
actions: document.actions,
|
||||||
warning: document.warning,
|
warning: document.warning
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
+66
-33
@@ -1,5 +1,5 @@
|
|||||||
import "dotenv/config";
|
import "dotenv/config";
|
||||||
import "./services/sentry"
|
import "./services/sentry";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import express, { NextFunction, Request, Response } from "express";
|
import express, { NextFunction, Request, Response } from "express";
|
||||||
import bodyParser from "body-parser";
|
import bodyParser from "body-parser";
|
||||||
@@ -9,9 +9,9 @@ import { v0Router } from "./routes/v0";
|
|||||||
import os from "os";
|
import os from "os";
|
||||||
import { logger } from "./lib/logger";
|
import { logger } from "./lib/logger";
|
||||||
import { adminRouter } from "./routes/admin";
|
import { adminRouter } from "./routes/admin";
|
||||||
import http from 'node:http';
|
import http from "node:http";
|
||||||
import https from 'node:https';
|
import https from "node:https";
|
||||||
import CacheableLookup from 'cacheable-lookup';
|
import CacheableLookup from "cacheable-lookup";
|
||||||
import { v1Router } from "./routes/v1";
|
import { v1Router } from "./routes/v1";
|
||||||
import expressWs from "express-ws";
|
import expressWs from "express-ws";
|
||||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||||
@@ -25,14 +25,12 @@ const { ExpressAdapter } = require("@bull-board/express");
|
|||||||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||||
logger.info(`Number of CPUs: ${numCPUs} available`);
|
logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||||
|
|
||||||
const cacheable = new CacheableLookup()
|
const cacheable = new CacheableLookup();
|
||||||
|
|
||||||
|
|
||||||
// Install cacheable lookup for all other requests
|
// Install cacheable lookup for all other requests
|
||||||
cacheable.install(http.globalAgent);
|
cacheable.install(http.globalAgent);
|
||||||
cacheable.install(https.globalAgent);
|
cacheable.install(https.globalAgent);
|
||||||
|
|
||||||
|
|
||||||
const ws = expressWs(express());
|
const ws = expressWs(express());
|
||||||
const app = ws.app;
|
const app = ws.app;
|
||||||
|
|
||||||
@@ -48,7 +46,7 @@ serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`);
|
|||||||
|
|
||||||
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
|
||||||
queues: [new BullAdapter(getScrapeQueue())],
|
queues: [new BullAdapter(getScrapeQueue())],
|
||||||
serverAdapter: serverAdapter,
|
serverAdapter: serverAdapter
|
||||||
});
|
});
|
||||||
|
|
||||||
app.use(
|
app.use(
|
||||||
@@ -82,15 +80,15 @@ function startServer(port = DEFAULT_PORT) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
const exitHandler = () => {
|
const exitHandler = () => {
|
||||||
logger.info('SIGTERM signal received: closing HTTP server')
|
logger.info("SIGTERM signal received: closing HTTP server");
|
||||||
server.close(() => {
|
server.close(() => {
|
||||||
logger.info("Server closed.");
|
logger.info("Server closed.");
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
process.on('SIGTERM', exitHandler);
|
process.on("SIGTERM", exitHandler);
|
||||||
process.on('SIGINT', exitHandler);
|
process.on("SIGINT", exitHandler);
|
||||||
return server;
|
return server;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,13 +99,11 @@ if (require.main === module) {
|
|||||||
app.get(`/serverHealthCheck`, async (req, res) => {
|
app.get(`/serverHealthCheck`, async (req, res) => {
|
||||||
try {
|
try {
|
||||||
const scrapeQueue = getScrapeQueue();
|
const scrapeQueue = getScrapeQueue();
|
||||||
const [waitingJobs] = await Promise.all([
|
const [waitingJobs] = await Promise.all([scrapeQueue.getWaitingCount()]);
|
||||||
scrapeQueue.getWaitingCount(),
|
|
||||||
]);
|
|
||||||
const noWaitingJobs = waitingJobs === 0;
|
const noWaitingJobs = waitingJobs === 0;
|
||||||
// 200 if no active jobs, 503 if there are active jobs
|
// 200 if no active jobs, 503 if there are active jobs
|
||||||
return res.status(noWaitingJobs ? 200 : 500).json({
|
return res.status(noWaitingJobs ? 200 : 500).json({
|
||||||
waitingJobs,
|
waitingJobs
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
@@ -124,7 +120,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||||||
const getWaitingJobsCount = async () => {
|
const getWaitingJobsCount = async () => {
|
||||||
const scrapeQueue = getScrapeQueue();
|
const scrapeQueue = getScrapeQueue();
|
||||||
const [waitingJobsCount] = await Promise.all([
|
const [waitingJobsCount] = await Promise.all([
|
||||||
scrapeQueue.getWaitingCount(),
|
scrapeQueue.getWaitingCount()
|
||||||
]);
|
]);
|
||||||
|
|
||||||
return waitingJobsCount;
|
return waitingJobsCount;
|
||||||
@@ -144,15 +140,15 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
|
|||||||
const message = {
|
const message = {
|
||||||
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
||||||
timeout / 60000
|
timeout / 60000
|
||||||
} minute(s).`,
|
} minute(s).`
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await fetch(slackWebhookUrl, {
|
const response = await fetch(slackWebhookUrl, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json"
|
||||||
},
|
},
|
||||||
body: JSON.stringify(message),
|
body: JSON.stringify(message)
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
@@ -175,23 +171,48 @@ app.get("/is-production", (req, res) => {
|
|||||||
res.send({ isProduction: global.isProduction });
|
res.send({ isProduction: global.isProduction });
|
||||||
});
|
});
|
||||||
|
|
||||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
|
app.use(
|
||||||
|
(
|
||||||
|
err: unknown,
|
||||||
|
req: Request<{}, ErrorResponse, undefined>,
|
||||||
|
res: Response<ErrorResponse>,
|
||||||
|
next: NextFunction
|
||||||
|
) => {
|
||||||
if (err instanceof ZodError) {
|
if (err instanceof ZodError) {
|
||||||
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
|
if (
|
||||||
|
Array.isArray(err.errors) &&
|
||||||
|
err.errors.find((x) => x.message === "URL uses unsupported protocol")
|
||||||
|
) {
|
||||||
logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
|
||||||
}
|
}
|
||||||
|
|
||||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
res
|
||||||
|
.status(400)
|
||||||
|
.json({ success: false, error: "Bad Request", details: err.errors });
|
||||||
} else {
|
} else {
|
||||||
next(err);
|
next(err);
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
Sentry.setupExpressErrorHandler(app);
|
Sentry.setupExpressErrorHandler(app);
|
||||||
|
|
||||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
|
app.use(
|
||||||
if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) {
|
(
|
||||||
return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' });
|
err: unknown,
|
||||||
|
req: Request<{}, ErrorResponse, undefined>,
|
||||||
|
res: ResponseWithSentry<ErrorResponse>,
|
||||||
|
next: NextFunction
|
||||||
|
) => {
|
||||||
|
if (
|
||||||
|
err instanceof SyntaxError &&
|
||||||
|
"status" in err &&
|
||||||
|
err.status === 400 &&
|
||||||
|
"body" in err
|
||||||
|
) {
|
||||||
|
return res
|
||||||
|
.status(400)
|
||||||
|
.json({ success: false, error: "Bad request, malformed JSON" });
|
||||||
}
|
}
|
||||||
|
|
||||||
const id = res.sentry ?? uuidv4();
|
const id = res.sentry ?? uuidv4();
|
||||||
@@ -201,14 +222,29 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
|
|||||||
verbose = JSON.stringify({
|
verbose = JSON.stringify({
|
||||||
message: err.message,
|
message: err.message,
|
||||||
name: err.name,
|
name: err.name,
|
||||||
stack: err.stack,
|
stack: err.stack
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
logger.error(
|
||||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
|
"Error occurred in request! (" +
|
||||||
});
|
req.path +
|
||||||
|
") -- ID " +
|
||||||
|
id +
|
||||||
|
" -- " +
|
||||||
|
verbose
|
||||||
|
);
|
||||||
|
res
|
||||||
|
.status(500)
|
||||||
|
.json({
|
||||||
|
success: false,
|
||||||
|
error:
|
||||||
|
"An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " +
|
||||||
|
id
|
||||||
|
});
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
logger.info(`Worker ${process.pid} started`);
|
logger.info(`Worker ${process.pid} started`);
|
||||||
|
|
||||||
@@ -220,6 +256,3 @@ logger.info(`Worker ${process.pid} started`);
|
|||||||
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
// sq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused"));
|
||||||
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
// sq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed"));
|
||||||
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
// sq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ export async function generateCompletions(
|
|||||||
schema: schema,
|
schema: schema,
|
||||||
prompt: prompt,
|
prompt: prompt,
|
||||||
systemPrompt: systemPrompt,
|
systemPrompt: systemPrompt,
|
||||||
mode: mode,
|
mode: mode
|
||||||
});
|
});
|
||||||
// Validate the JSON output against the schema using AJV
|
// Validate the JSON output against the schema using AJV
|
||||||
if (schema) {
|
if (schema) {
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ export async function generateOpenAICompletions({
|
|||||||
systemPrompt = defaultPrompt,
|
systemPrompt = defaultPrompt,
|
||||||
prompt,
|
prompt,
|
||||||
temperature,
|
temperature,
|
||||||
mode,
|
mode
|
||||||
}: {
|
}: {
|
||||||
client: OpenAI;
|
client: OpenAI;
|
||||||
model?: string;
|
model?: string;
|
||||||
@@ -68,7 +68,7 @@ export async function generateOpenAICompletions({
|
|||||||
return {
|
return {
|
||||||
...document,
|
...document,
|
||||||
warning:
|
warning:
|
||||||
"LLM extraction was not performed since the document's content is empty or missing.",
|
"LLM extraction was not performed since the document's content is empty or missing."
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
const [content, numTokens] = preparedDoc;
|
const [content, numTokens] = preparedDoc;
|
||||||
@@ -81,16 +81,16 @@ export async function generateOpenAICompletions({
|
|||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: systemPrompt,
|
content: systemPrompt
|
||||||
},
|
},
|
||||||
{ role: "user", content },
|
{ role: "user", content },
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
|
content: `Transform the above content into structured json output based on the following user request: ${prompt}`
|
||||||
},
|
}
|
||||||
],
|
],
|
||||||
response_format: { type: "json_object" },
|
response_format: { type: "json_object" },
|
||||||
temperature,
|
temperature
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -106,9 +106,9 @@ export async function generateOpenAICompletions({
|
|||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: systemPrompt,
|
content: systemPrompt
|
||||||
},
|
},
|
||||||
{ role: "user", content },
|
{ role: "user", content }
|
||||||
],
|
],
|
||||||
tools: [
|
tools: [
|
||||||
{
|
{
|
||||||
@@ -116,12 +116,12 @@ export async function generateOpenAICompletions({
|
|||||||
function: {
|
function: {
|
||||||
name: "extract_content",
|
name: "extract_content",
|
||||||
description: "Extracts the content from the given webpage(s)",
|
description: "Extracts the content from the given webpage(s)",
|
||||||
parameters: schema,
|
parameters: schema
|
||||||
},
|
}
|
||||||
},
|
}
|
||||||
],
|
],
|
||||||
tool_choice: { type: "function", function: { name: "extract_content" } },
|
tool_choice: { type: "function", function: { name: "extract_content" } },
|
||||||
temperature,
|
temperature
|
||||||
});
|
});
|
||||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||||
|
|
||||||
@@ -140,6 +140,6 @@ export async function generateOpenAICompletions({
|
|||||||
warning:
|
warning:
|
||||||
numTokens > maxTokens
|
numTokens > maxTokens
|
||||||
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
? `Page was trimmed to fit the maximum token limit defined by the LLM model (Max: ${maxTokens} tokens, Attemped: ${numTokens} tokens). If results are not good, email us at help@mendable.ai so we can help you.`
|
||||||
: undefined,
|
: undefined
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,36 +1,46 @@
|
|||||||
import { parseMarkdown } from '../html-to-markdown';
|
import { parseMarkdown } from "../html-to-markdown";
|
||||||
|
|
||||||
describe('parseMarkdown', () => {
|
describe("parseMarkdown", () => {
|
||||||
it('should correctly convert simple HTML to Markdown', async () => {
|
it("should correctly convert simple HTML to Markdown", async () => {
|
||||||
const html = '<p>Hello, world!</p>';
|
const html = "<p>Hello, world!</p>";
|
||||||
const expectedMarkdown = 'Hello, world!';
|
const expectedMarkdown = "Hello, world!";
|
||||||
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should convert complex HTML with nested elements to Markdown', async () => {
|
it("should convert complex HTML with nested elements to Markdown", async () => {
|
||||||
const html = '<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>';
|
const html =
|
||||||
const expectedMarkdown = 'Hello **bold** world!\n\n- List item';
|
"<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>";
|
||||||
|
const expectedMarkdown = "Hello **bold** world!\n\n- List item";
|
||||||
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should return empty string when input is empty', async () => {
|
it("should return empty string when input is empty", async () => {
|
||||||
const html = '';
|
const html = "";
|
||||||
const expectedMarkdown = '';
|
const expectedMarkdown = "";
|
||||||
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle null input gracefully', async () => {
|
it("should handle null input gracefully", async () => {
|
||||||
const html = null;
|
const html = null;
|
||||||
const expectedMarkdown = '';
|
const expectedMarkdown = "";
|
||||||
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle various types of invalid HTML gracefully', async () => {
|
it("should handle various types of invalid HTML gracefully", async () => {
|
||||||
const invalidHtmls = [
|
const invalidHtmls = [
|
||||||
{ html: '<html><p>Unclosed tag', expected: 'Unclosed tag' },
|
{ html: "<html><p>Unclosed tag", expected: "Unclosed tag" },
|
||||||
{ html: '<div><span>Missing closing div', expected: 'Missing closing div' },
|
{
|
||||||
{ html: '<p><strong>Wrong nesting</em></strong></p>', expected: '**Wrong nesting**' },
|
html: "<div><span>Missing closing div",
|
||||||
{ html: '<a href="http://example.com">Link without closing tag', expected: '[Link without closing tag](http://example.com)' }
|
expected: "Missing closing div"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
html: "<p><strong>Wrong nesting</em></strong></p>",
|
||||||
|
expected: "**Wrong nesting**"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
html: '<a href="http://example.com">Link without closing tag',
|
||||||
|
expected: "[Link without closing tag](http://example.com)"
|
||||||
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const { html, expected } of invalidHtmls) {
|
for (const { html, expected } of invalidHtmls) {
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import {
|
import {
|
||||||
getJobPriority,
|
getJobPriority,
|
||||||
addJobPriority,
|
addJobPriority,
|
||||||
deleteJobPriority,
|
deleteJobPriority
|
||||||
} from "../job-priority";
|
} from "../job-priority";
|
||||||
import { redisConnection } from "../../services/queue-service";
|
import { redisConnection } from "../../services/queue-service";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
@@ -11,8 +11,8 @@ jest.mock("../../services/queue-service", () => ({
|
|||||||
sadd: jest.fn(),
|
sadd: jest.fn(),
|
||||||
srem: jest.fn(),
|
srem: jest.fn(),
|
||||||
scard: jest.fn(),
|
scard: jest.fn(),
|
||||||
expire: jest.fn(),
|
expire: jest.fn()
|
||||||
},
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
describe("Job Priority Tests", () => {
|
describe("Job Priority Tests", () => {
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ export async function batchProcess<T>(
|
|||||||
array: T[],
|
array: T[],
|
||||||
batchSize: number,
|
batchSize: number,
|
||||||
asyncFunction: (item: T, index: number) => Promise<void>
|
asyncFunction: (item: T, index: number) => Promise<void>
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const batches: T[][] = [];
|
const batches: T[][] = [];
|
||||||
for (let i = 0; i < array.length; i += batchSize) {
|
for (let i = 0; i < array.length; i += batchSize) {
|
||||||
const batch = array.slice(i, i + batchSize);
|
const batch = array.slice(i, i + batchSize);
|
||||||
@@ -12,5 +12,4 @@ export async function batchProcess<T>(
|
|||||||
for (const batch of batches) {
|
for (const batch of batches) {
|
||||||
await Promise.all(batch.map((item, i) => asyncFunction(item, i)));
|
await Promise.all(batch.map((item, i) => asyncFunction(item, i)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,18 +2,28 @@ import IORedis from "ioredis";
|
|||||||
import { ScrapeOptions } from "../controllers/v1/types";
|
import { ScrapeOptions } from "../controllers/v1/types";
|
||||||
import { InternalOptions } from "../scraper/scrapeURL";
|
import { InternalOptions } from "../scraper/scrapeURL";
|
||||||
import { logger as _logger } from "./logger";
|
import { logger as _logger } from "./logger";
|
||||||
const logger = _logger.child({module: "cache"});
|
const logger = _logger.child({ module: "cache" });
|
||||||
|
|
||||||
export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
|
export const cacheRedis = process.env.CACHE_REDIS_URL
|
||||||
maxRetriesPerRequest: null,
|
? new IORedis(process.env.CACHE_REDIS_URL, {
|
||||||
}) : null;
|
maxRetriesPerRequest: null
|
||||||
|
})
|
||||||
|
: null;
|
||||||
|
|
||||||
export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
|
export function cacheKey(
|
||||||
|
url: string,
|
||||||
|
scrapeOptions: ScrapeOptions,
|
||||||
|
internalOptions: InternalOptions
|
||||||
|
): string | null {
|
||||||
if (!cacheRedis) return null;
|
if (!cacheRedis) return null;
|
||||||
|
|
||||||
// these options disqualify a cache
|
// these options disqualify a cache
|
||||||
if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
|
if (
|
||||||
|| (scrapeOptions.actions && scrapeOptions.actions.length > 0)
|
internalOptions.v0CrawlOnlyUrls ||
|
||||||
|
internalOptions.forceEngine ||
|
||||||
|
internalOptions.v0UseFastMode ||
|
||||||
|
internalOptions.atsv ||
|
||||||
|
(scrapeOptions.actions && scrapeOptions.actions.length > 0)
|
||||||
) {
|
) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -38,11 +48,13 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
|
export async function getEntryFromCache(
|
||||||
|
key: string
|
||||||
|
): Promise<CacheEntry | null> {
|
||||||
if (!cacheRedis) return null;
|
if (!cacheRedis) return null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return JSON.parse(await cacheRedis.get(key) ?? "null");
|
return JSON.parse((await cacheRedis.get(key)) ?? "null");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warn("Failed to get from cache", { key, error });
|
logger.warn("Failed to get from cache", { key, error });
|
||||||
return null;
|
return null;
|
||||||
|
|||||||
@@ -4,26 +4,48 @@ import { RateLimiterMode } from "../types";
|
|||||||
import { JobsOptions } from "bullmq";
|
import { JobsOptions } from "bullmq";
|
||||||
|
|
||||||
const constructKey = (team_id: string) => "concurrency-limiter:" + team_id;
|
const constructKey = (team_id: string) => "concurrency-limiter:" + team_id;
|
||||||
const constructQueueKey = (team_id: string) => "concurrency-limit-queue:" + team_id;
|
const constructQueueKey = (team_id: string) =>
|
||||||
|
"concurrency-limit-queue:" + team_id;
|
||||||
const stalledJobTimeoutMs = 2 * 60 * 1000;
|
const stalledJobTimeoutMs = 2 * 60 * 1000;
|
||||||
|
|
||||||
export function getConcurrencyLimitMax(plan: string): number {
|
export function getConcurrencyLimitMax(plan: string): number {
|
||||||
return getRateLimiterPoints(RateLimiterMode.Scrape, undefined, plan);
|
return getRateLimiterPoints(RateLimiterMode.Scrape, undefined, plan);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function cleanOldConcurrencyLimitEntries(team_id: string, now: number = Date.now()) {
|
export async function cleanOldConcurrencyLimitEntries(
|
||||||
|
team_id: string,
|
||||||
|
now: number = Date.now()
|
||||||
|
) {
|
||||||
await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
|
await redisConnection.zremrangebyscore(constructKey(team_id), -Infinity, now);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getConcurrencyLimitActiveJobs(team_id: string, now: number = Date.now()): Promise<string[]> {
|
export async function getConcurrencyLimitActiveJobs(
|
||||||
return await redisConnection.zrangebyscore(constructKey(team_id), now, Infinity);
|
team_id: string,
|
||||||
|
now: number = Date.now()
|
||||||
|
): Promise<string[]> {
|
||||||
|
return await redisConnection.zrangebyscore(
|
||||||
|
constructKey(team_id),
|
||||||
|
now,
|
||||||
|
Infinity
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function pushConcurrencyLimitActiveJob(team_id: string, id: string, now: number = Date.now()) {
|
export async function pushConcurrencyLimitActiveJob(
|
||||||
await redisConnection.zadd(constructKey(team_id), now + stalledJobTimeoutMs, id);
|
team_id: string,
|
||||||
|
id: string,
|
||||||
|
now: number = Date.now()
|
||||||
|
) {
|
||||||
|
await redisConnection.zadd(
|
||||||
|
constructKey(team_id),
|
||||||
|
now + stalledJobTimeoutMs,
|
||||||
|
id
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function removeConcurrencyLimitActiveJob(team_id: string, id: string) {
|
export async function removeConcurrencyLimitActiveJob(
|
||||||
|
team_id: string,
|
||||||
|
id: string
|
||||||
|
) {
|
||||||
await redisConnection.zrem(constructKey(team_id), id);
|
await redisConnection.zrem(constructKey(team_id), id);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,9 +54,11 @@ export type ConcurrencyLimitedJob = {
|
|||||||
data: any;
|
data: any;
|
||||||
opts: JobsOptions;
|
opts: JobsOptions;
|
||||||
priority?: number;
|
priority?: number;
|
||||||
}
|
};
|
||||||
|
|
||||||
export async function takeConcurrencyLimitedJob(team_id: string): Promise<ConcurrencyLimitedJob | null> {
|
export async function takeConcurrencyLimitedJob(
|
||||||
|
team_id: string
|
||||||
|
): Promise<ConcurrencyLimitedJob | null> {
|
||||||
const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
|
const res = await redisConnection.zmpop(1, constructQueueKey(team_id), "MIN");
|
||||||
if (res === null || res === undefined) {
|
if (res === null || res === undefined) {
|
||||||
return null;
|
return null;
|
||||||
@@ -43,6 +67,13 @@ export async function takeConcurrencyLimitedJob(team_id: string): Promise<Concur
|
|||||||
return JSON.parse(res[1][0][0]);
|
return JSON.parse(res[1][0][0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function pushConcurrencyLimitedJob(team_id: string, job: ConcurrencyLimitedJob) {
|
export async function pushConcurrencyLimitedJob(
|
||||||
await redisConnection.zadd(constructQueueKey(team_id), job.priority ?? 1, JSON.stringify(job));
|
team_id: string,
|
||||||
|
job: ConcurrencyLimitedJob
|
||||||
|
) {
|
||||||
|
await redisConnection.zadd(
|
||||||
|
constructQueueKey(team_id),
|
||||||
|
job.priority ?? 1,
|
||||||
|
JSON.stringify(job)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,32 +2,40 @@ import { generateURLPermutations } from "./crawl-redis";
|
|||||||
|
|
||||||
describe("generateURLPermutations", () => {
|
describe("generateURLPermutations", () => {
|
||||||
it("generates permutations correctly", () => {
|
it("generates permutations correctly", () => {
|
||||||
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(x => x.href);
|
const bareHttps = generateURLPermutations("https://firecrawl.dev").map(
|
||||||
|
(x) => x.href
|
||||||
|
);
|
||||||
expect(bareHttps.length).toBe(4);
|
expect(bareHttps.length).toBe(4);
|
||||||
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
|
expect(bareHttps.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
|
expect(bareHttps.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||||
expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
|
expect(bareHttps.includes("http://firecrawl.dev/")).toBe(true);
|
||||||
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(bareHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
|
|
||||||
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(x => x.href);
|
const bareHttp = generateURLPermutations("http://firecrawl.dev").map(
|
||||||
|
(x) => x.href
|
||||||
|
);
|
||||||
expect(bareHttp.length).toBe(4);
|
expect(bareHttp.length).toBe(4);
|
||||||
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
|
expect(bareHttp.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
|
expect(bareHttp.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||||
expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
|
expect(bareHttp.includes("http://firecrawl.dev/")).toBe(true);
|
||||||
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(bareHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
|
|
||||||
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(x => x.href);
|
const wwwHttps = generateURLPermutations("https://www.firecrawl.dev").map(
|
||||||
|
(x) => x.href
|
||||||
|
);
|
||||||
expect(wwwHttps.length).toBe(4);
|
expect(wwwHttps.length).toBe(4);
|
||||||
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
|
expect(wwwHttps.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
|
expect(wwwHttps.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||||
expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
|
expect(wwwHttps.includes("http://firecrawl.dev/")).toBe(true);
|
||||||
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(wwwHttps.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
|
|
||||||
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(x => x.href);
|
const wwwHttp = generateURLPermutations("http://www.firecrawl.dev").map(
|
||||||
|
(x) => x.href
|
||||||
|
);
|
||||||
expect(wwwHttp.length).toBe(4);
|
expect(wwwHttp.length).toBe(4);
|
||||||
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
|
expect(wwwHttp.includes("https://firecrawl.dev/")).toBe(true);
|
||||||
expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
|
expect(wwwHttp.includes("https://www.firecrawl.dev/")).toBe(true);
|
||||||
expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
|
expect(wwwHttp.includes("http://firecrawl.dev/")).toBe(true);
|
||||||
expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
expect(wwwHttp.includes("http://www.firecrawl.dev/")).toBe(true);
|
||||||
})
|
});
|
||||||
});
|
});
|
||||||
+144
-35
@@ -18,7 +18,14 @@ export type StoredCrawl = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export async function saveCrawl(id: string, crawl: StoredCrawl) {
|
export async function saveCrawl(id: string, crawl: StoredCrawl) {
|
||||||
_logger.debug("Saving crawl " + id + " to Redis...", { crawl, module: "crawl-redis", method: "saveCrawl", crawlId: id, teamId: crawl.team_id, plan: crawl.plan });
|
_logger.debug("Saving crawl " + id + " to Redis...", {
|
||||||
|
crawl,
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "saveCrawl",
|
||||||
|
crawlId: id,
|
||||||
|
teamId: crawl.team_id,
|
||||||
|
plan: crawl.plan
|
||||||
|
});
|
||||||
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
|
await redisConnection.set("crawl:" + id, JSON.stringify(crawl));
|
||||||
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX");
|
||||||
}
|
}
|
||||||
@@ -42,25 +49,52 @@ export async function getCrawlExpiry(id: string): Promise<Date> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function addCrawlJob(id: string, job_id: string) {
|
export async function addCrawlJob(id: string, job_id: string) {
|
||||||
_logger.debug("Adding crawl job " + job_id + " to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJob", crawlId: id });
|
_logger.debug("Adding crawl job " + job_id + " to Redis...", {
|
||||||
|
jobId: job_id,
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "addCrawlJob",
|
||||||
|
crawlId: id
|
||||||
|
});
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function addCrawlJobs(id: string, job_ids: string[]) {
|
export async function addCrawlJobs(id: string, job_ids: string[]) {
|
||||||
_logger.debug("Adding crawl jobs to Redis...", { jobIds: job_ids, module: "crawl-redis", method: "addCrawlJobs", crawlId: id });
|
_logger.debug("Adding crawl jobs to Redis...", {
|
||||||
|
jobIds: job_ids,
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "addCrawlJobs",
|
||||||
|
crawlId: id
|
||||||
|
});
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
|
await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function addCrawlJobDone(id: string, job_id: string, success: boolean) {
|
export async function addCrawlJobDone(
|
||||||
_logger.debug("Adding done crawl job to Redis...", { jobId: job_id, module: "crawl-redis", method: "addCrawlJobDone", crawlId: id });
|
id: string,
|
||||||
|
job_id: string,
|
||||||
|
success: boolean
|
||||||
|
) {
|
||||||
|
_logger.debug("Adding done crawl job to Redis...", {
|
||||||
|
jobId: job_id,
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "addCrawlJobDone",
|
||||||
|
crawlId: id
|
||||||
|
});
|
||||||
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
|
await redisConnection.expire(
|
||||||
|
"crawl:" + id + ":jobs_done",
|
||||||
|
24 * 60 * 60,
|
||||||
|
"NX"
|
||||||
|
);
|
||||||
|
|
||||||
if (success) {
|
if (success) {
|
||||||
await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
|
await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
|
||||||
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
|
await redisConnection.expire(
|
||||||
|
"crawl:" + id + ":jobs_done_ordered",
|
||||||
|
24 * 60 * 60,
|
||||||
|
"NX"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -68,28 +102,47 @@ export async function getDoneJobsOrderedLength(id: string): Promise<number> {
|
|||||||
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
|
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
|
export async function getDoneJobsOrdered(
|
||||||
return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
|
id: string,
|
||||||
|
start = 0,
|
||||||
|
end = -1
|
||||||
|
): Promise<string[]> {
|
||||||
|
return await redisConnection.lrange(
|
||||||
|
"crawl:" + id + ":jobs_done_ordered",
|
||||||
|
start,
|
||||||
|
end
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function isCrawlFinished(id: string) {
|
export async function isCrawlFinished(id: string) {
|
||||||
return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs"));
|
return (
|
||||||
|
(await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
|
||||||
|
(await redisConnection.scard("crawl:" + id + ":jobs"))
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function isCrawlFinishedLocked(id: string) {
|
export async function isCrawlFinishedLocked(id: string) {
|
||||||
return (await redisConnection.exists("crawl:" + id + ":finish"));
|
return await redisConnection.exists("crawl:" + id + ":finish");
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function finishCrawl(id: string) {
|
export async function finishCrawl(id: string) {
|
||||||
if (await isCrawlFinished(id)) {
|
if (await isCrawlFinished(id)) {
|
||||||
_logger.debug("Marking crawl as finished.", { module: "crawl-redis", method: "finishCrawl", crawlId: id });
|
_logger.debug("Marking crawl as finished.", {
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "finishCrawl",
|
||||||
|
crawlId: id
|
||||||
|
});
|
||||||
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
||||||
if (set === 1) {
|
if (set === 1) {
|
||||||
await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
|
await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60);
|
||||||
}
|
}
|
||||||
return set === 1
|
return set === 1;
|
||||||
} else {
|
} else {
|
||||||
_logger.debug("Crawl can not be finished yet, not marking as finished.", { module: "crawl-redis", method: "finishCrawl", crawlId: id });
|
_logger.debug("Crawl can not be finished yet, not marking as finished.", {
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "finishCrawl",
|
||||||
|
crawlId: id
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -98,7 +151,11 @@ export async function getCrawlJobs(id: string): Promise<string[]> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function getThrottledJobs(teamId: string): Promise<string[]> {
|
export async function getThrottledJobs(teamId: string): Promise<string[]> {
|
||||||
return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
|
return await redisConnection.zrangebyscore(
|
||||||
|
"concurrency-limiter:" + teamId + ":throttled",
|
||||||
|
Date.now(),
|
||||||
|
Infinity
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
export function normalizeURL(url: string, sc: StoredCrawl): string {
|
export function normalizeURL(url: string, sc: StoredCrawl): string {
|
||||||
@@ -125,7 +182,7 @@ export function generateURLPermutations(url: string | URL): URL[] {
|
|||||||
let permutations = [urlWithWWW, urlWithoutWWW];
|
let permutations = [urlWithWWW, urlWithoutWWW];
|
||||||
|
|
||||||
// Construct more versions for http/https
|
// Construct more versions for http/https
|
||||||
permutations = permutations.flatMap(urlO => {
|
permutations = permutations.flatMap((urlO) => {
|
||||||
if (!["http:", "https:"].includes(urlO.protocol)) {
|
if (!["http:", "https:"].includes(urlO.protocol)) {
|
||||||
return [urlO];
|
return [urlO];
|
||||||
}
|
}
|
||||||
@@ -141,12 +198,28 @@ export function generateURLPermutations(url: string | URL): URL[] {
|
|||||||
return permutations;
|
return permutations;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
|
export async function lockURL(
|
||||||
let logger = _logger.child({ crawlId: id, module: "crawl-redis", method: "lockURL", preNormalizedURL: url, teamId: sc.team_id, plan: sc.plan });
|
id: string,
|
||||||
|
sc: StoredCrawl,
|
||||||
|
url: string
|
||||||
|
): Promise<boolean> {
|
||||||
|
let logger = _logger.child({
|
||||||
|
crawlId: id,
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "lockURL",
|
||||||
|
preNormalizedURL: url,
|
||||||
|
teamId: sc.team_id,
|
||||||
|
plan: sc.plan
|
||||||
|
});
|
||||||
|
|
||||||
if (typeof sc.crawlerOptions?.limit === "number") {
|
if (typeof sc.crawlerOptions?.limit === "number") {
|
||||||
if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
|
if (
|
||||||
logger.debug("Crawl has already hit visited_unique limit, not locking URL.");
|
(await redisConnection.scard("crawl:" + id + ":visited_unique")) >=
|
||||||
|
sc.crawlerOptions.limit
|
||||||
|
) {
|
||||||
|
logger.debug(
|
||||||
|
"Crawl has already hit visited_unique limit, not locking URL."
|
||||||
|
);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -155,42 +228,70 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
|
|||||||
logger = logger.child({ url });
|
logger = logger.child({ url });
|
||||||
|
|
||||||
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
|
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
|
||||||
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
|
await redisConnection.expire(
|
||||||
|
"crawl:" + id + ":visited_unique",
|
||||||
|
24 * 60 * 60,
|
||||||
|
"NX"
|
||||||
|
);
|
||||||
|
|
||||||
let res: boolean;
|
let res: boolean;
|
||||||
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
|
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
|
||||||
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
|
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
|
||||||
} else {
|
} else {
|
||||||
const permutations = generateURLPermutations(url).map(x => x.href);
|
const permutations = generateURLPermutations(url).map((x) => x.href);
|
||||||
// logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
|
// logger.debug("Adding URL permutations for URL " + JSON.stringify(url) + "...", { permutations });
|
||||||
const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations));
|
const x = await redisConnection.sadd(
|
||||||
|
"crawl:" + id + ":visited",
|
||||||
|
...permutations
|
||||||
|
);
|
||||||
res = x === permutations.length;
|
res = x === permutations.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
|
||||||
|
|
||||||
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, { res });
|
logger.debug("Locking URL " + JSON.stringify(url) + "... result: " + res, {
|
||||||
|
res
|
||||||
|
});
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
|
||||||
export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
|
export async function lockURLs(
|
||||||
urls = urls.map(url => normalizeURL(url, sc));
|
id: string,
|
||||||
const logger = _logger.child({ crawlId: id, module: "crawl-redis", method: "lockURL", teamId: sc.team_id, plan: sc.plan });
|
sc: StoredCrawl,
|
||||||
|
urls: string[]
|
||||||
|
): Promise<boolean> {
|
||||||
|
urls = urls.map((url) => normalizeURL(url, sc));
|
||||||
|
const logger = _logger.child({
|
||||||
|
crawlId: id,
|
||||||
|
module: "crawl-redis",
|
||||||
|
method: "lockURL",
|
||||||
|
teamId: sc.team_id,
|
||||||
|
plan: sc.plan
|
||||||
|
});
|
||||||
|
|
||||||
// Add to visited_unique set
|
// Add to visited_unique set
|
||||||
logger.debug("Locking " + urls.length + " URLs...");
|
logger.debug("Locking " + urls.length + " URLs...");
|
||||||
await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
|
await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
|
||||||
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
|
await redisConnection.expire(
|
||||||
|
"crawl:" + id + ":visited_unique",
|
||||||
|
24 * 60 * 60,
|
||||||
|
"NX"
|
||||||
|
);
|
||||||
|
|
||||||
let res: boolean;
|
let res: boolean;
|
||||||
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
|
if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
|
||||||
const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls);
|
const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls);
|
||||||
res = x === urls.length;
|
res = x === urls.length;
|
||||||
} else {
|
} else {
|
||||||
const allPermutations = urls.flatMap(url => generateURLPermutations(url).map(x => x.href));
|
const allPermutations = urls.flatMap((url) =>
|
||||||
|
generateURLPermutations(url).map((x) => x.href)
|
||||||
|
);
|
||||||
logger.debug("Adding " + allPermutations.length + " URL permutations...");
|
logger.debug("Adding " + allPermutations.length + " URL permutations...");
|
||||||
const x = await redisConnection.sadd("crawl:" + id + ":visited", ...allPermutations);
|
const x = await redisConnection.sadd(
|
||||||
|
"crawl:" + id + ":visited",
|
||||||
|
...allPermutations
|
||||||
|
);
|
||||||
res = x === allPermutations.length;
|
res = x === allPermutations.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,7 +301,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
|
export function crawlToCrawler(
|
||||||
|
id: string,
|
||||||
|
sc: StoredCrawl,
|
||||||
|
newBase?: string
|
||||||
|
): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
initialUrl: sc.originUrl!,
|
initialUrl: sc.originUrl!,
|
||||||
@@ -208,13 +313,17 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
|
|||||||
includes: sc.crawlerOptions?.includes ?? [],
|
includes: sc.crawlerOptions?.includes ?? [],
|
||||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
|
maxCrawledDepth: getAdjustedMaxDepth(
|
||||||
|
sc.originUrl!,
|
||||||
|
sc.crawlerOptions?.maxDepth ?? 10
|
||||||
|
),
|
||||||
limit: sc.crawlerOptions?.limit ?? 10000,
|
limit: sc.crawlerOptions?.limit ?? 10000,
|
||||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||||
allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
allowExternalContentLinks:
|
||||||
|
sc.crawlerOptions?.allowExternalContentLinks ?? false,
|
||||||
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
|
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
|
||||||
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
|
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false
|
||||||
});
|
});
|
||||||
|
|
||||||
if (sc.robots !== undefined) {
|
if (sc.robots !== undefined) {
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ export class CustomError extends Error {
|
|||||||
statusCode: number,
|
statusCode: number,
|
||||||
status: string,
|
status: string,
|
||||||
message: string = "",
|
message: string = "",
|
||||||
dataIngestionJob?: any,
|
dataIngestionJob?: any
|
||||||
) {
|
) {
|
||||||
super(message);
|
super(message);
|
||||||
this.statusCode = statusCode;
|
this.statusCode = statusCode;
|
||||||
@@ -19,4 +19,3 @@ export class CustomError extends Error {
|
|||||||
Object.setPrototypeOf(this, CustomError.prototype);
|
Object.setPrototypeOf(this, CustomError.prototype);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -14,15 +14,15 @@ export const defaultPageOptions = {
|
|||||||
export const defaultCrawlerOptions = {
|
export const defaultCrawlerOptions = {
|
||||||
allowBackwardCrawling: false,
|
allowBackwardCrawling: false,
|
||||||
limit: 10000
|
limit: 10000
|
||||||
}
|
};
|
||||||
|
|
||||||
export const defaultCrawlPageOptions = {
|
export const defaultCrawlPageOptions = {
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
removeTags: [],
|
removeTags: [],
|
||||||
parsePDF: true
|
parsePDF: true
|
||||||
}
|
};
|
||||||
|
|
||||||
export const defaultExtractorOptions = {
|
export const defaultExtractorOptions = {
|
||||||
mode: "markdown"
|
mode: "markdown"
|
||||||
}
|
};
|
||||||
|
|||||||
@@ -12,32 +12,40 @@ export interface Progress {
|
|||||||
currentDocument?: Document;
|
currentDocument?: Document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type Action = {
|
export type Action =
|
||||||
type: "wait",
|
| {
|
||||||
milliseconds?: number,
|
type: "wait";
|
||||||
selector?: string,
|
milliseconds?: number;
|
||||||
} | {
|
selector?: string;
|
||||||
type: "click",
|
}
|
||||||
selector: string,
|
| {
|
||||||
} | {
|
type: "click";
|
||||||
type: "screenshot",
|
selector: string;
|
||||||
fullPage?: boolean,
|
}
|
||||||
} | {
|
| {
|
||||||
type: "write",
|
type: "screenshot";
|
||||||
text: string,
|
fullPage?: boolean;
|
||||||
} | {
|
}
|
||||||
type: "press",
|
| {
|
||||||
key: string,
|
type: "write";
|
||||||
} | {
|
text: string;
|
||||||
type: "scroll",
|
}
|
||||||
direction?: "up" | "down",
|
| {
|
||||||
selector?: string,
|
type: "press";
|
||||||
} | {
|
key: string;
|
||||||
type: "scrape",
|
}
|
||||||
} | {
|
| {
|
||||||
type: "executeJavascript",
|
type: "scroll";
|
||||||
script: string,
|
direction?: "up" | "down";
|
||||||
}
|
selector?: string;
|
||||||
|
}
|
||||||
|
| {
|
||||||
|
type: "scrape";
|
||||||
|
}
|
||||||
|
| {
|
||||||
|
type: "executeJavascript";
|
||||||
|
script: string;
|
||||||
|
};
|
||||||
|
|
||||||
export type PageOptions = {
|
export type PageOptions = {
|
||||||
includeMarkdown?: boolean;
|
includeMarkdown?: boolean;
|
||||||
@@ -69,11 +77,15 @@ export type PageOptions = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
mode: "markdown" | "llm-extraction" | "llm-extraction-from-markdown" | "llm-extraction-from-raw-html";
|
mode:
|
||||||
|
| "markdown"
|
||||||
|
| "llm-extraction"
|
||||||
|
| "llm-extraction-from-markdown"
|
||||||
|
| "llm-extraction-from-raw-html";
|
||||||
extractionPrompt?: string;
|
extractionPrompt?: string;
|
||||||
extractionSchema?: Record<string, any>;
|
extractionSchema?: Record<string, any>;
|
||||||
userPrompt?: string;
|
userPrompt?: string;
|
||||||
}
|
};
|
||||||
|
|
||||||
export type SearchOptions = {
|
export type SearchOptions = {
|
||||||
limit?: number;
|
limit?: number;
|
||||||
@@ -97,7 +109,7 @@ export type CrawlerOptions = {
|
|||||||
mode?: "default" | "fast"; // have a mode of some sort
|
mode?: "default" | "fast"; // have a mode of some sort
|
||||||
allowBackwardCrawling?: boolean;
|
allowBackwardCrawling?: boolean;
|
||||||
allowExternalContentLinks?: boolean;
|
allowExternalContentLinks?: boolean;
|
||||||
}
|
};
|
||||||
|
|
||||||
export type WebScraperOptions = {
|
export type WebScraperOptions = {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
@@ -137,7 +149,7 @@ export class Document {
|
|||||||
actions?: {
|
actions?: {
|
||||||
screenshots?: string[];
|
screenshots?: string[];
|
||||||
scrapes?: ScrapeActionContent[];
|
scrapes?: ScrapeActionContent[];
|
||||||
}
|
};
|
||||||
|
|
||||||
index?: number;
|
index?: number;
|
||||||
linksOnPage?: string[]; // Add this new field as a separate property
|
linksOnPage?: string[]; // Add this new field as a separate property
|
||||||
@@ -158,7 +170,6 @@ export class Document {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export class SearchResult {
|
export class SearchResult {
|
||||||
url: string;
|
url: string;
|
||||||
title: string;
|
title: string;
|
||||||
@@ -188,8 +199,7 @@ export interface FireEngineResponse {
|
|||||||
scrapeActionContent?: ScrapeActionContent[];
|
scrapeActionContent?: ScrapeActionContent[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface FireEngineOptions {
|
||||||
export interface FireEngineOptions{
|
|
||||||
mobileProxy?: boolean;
|
mobileProxy?: boolean;
|
||||||
method?: string;
|
method?: string;
|
||||||
engine?: string;
|
engine?: string;
|
||||||
|
|||||||
@@ -5,9 +5,11 @@ export function buildDocument(document: Document): string {
|
|||||||
const markdown = document.markdown;
|
const markdown = document.markdown;
|
||||||
|
|
||||||
// for each key in the metadata allow up to 250 characters
|
// for each key in the metadata allow up to 250 characters
|
||||||
const metadataString = Object.entries(metadata).map(([key, value]) => {
|
const metadataString = Object.entries(metadata)
|
||||||
|
.map(([key, value]) => {
|
||||||
return `${key}: ${value?.toString().slice(0, 250)}`;
|
return `${key}: ${value?.toString().slice(0, 250)}`;
|
||||||
}).join('\n');
|
})
|
||||||
|
.join("\n");
|
||||||
|
|
||||||
const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
|
const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
|
||||||
const documentString = `${markdown}${documentMetadataString}`;
|
const documentString = `${markdown}${documentMetadataString}`;
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { CohereClient } from "cohere-ai";
|
import { CohereClient } from "cohere-ai";
|
||||||
import { MapDocument } from "../../controllers/v1/types";
|
import { MapDocument } from "../../controllers/v1/types";
|
||||||
const cohere = new CohereClient({
|
const cohere = new CohereClient({
|
||||||
token: process.env.COHERE_API_KEY,
|
token: process.env.COHERE_API_KEY
|
||||||
});
|
});
|
||||||
|
|
||||||
export async function rerankDocuments(
|
export async function rerankDocuments(
|
||||||
@@ -15,8 +15,14 @@ export async function rerankDocuments(
|
|||||||
query,
|
query,
|
||||||
topN,
|
topN,
|
||||||
model,
|
model,
|
||||||
returnDocuments: true,
|
returnDocuments: true
|
||||||
});
|
});
|
||||||
|
|
||||||
return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
|
return rerank.results
|
||||||
|
.sort((a, b) => b.relevanceScore - a.relevanceScore)
|
||||||
|
.map((x) => ({
|
||||||
|
document: x.document,
|
||||||
|
index: x.index,
|
||||||
|
relevanceScore: x.relevanceScore
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,16 +1,20 @@
|
|||||||
|
import koffi from "koffi";
|
||||||
import koffi from 'koffi';
|
import { join } from "path";
|
||||||
import { join } from 'path';
|
import "../services/sentry";
|
||||||
import "../services/sentry"
|
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
|
|
||||||
import dotenv from 'dotenv';
|
import dotenv from "dotenv";
|
||||||
import { logger } from './logger';
|
import { logger } from "./logger";
|
||||||
import { stat } from 'fs/promises';
|
import { stat } from "fs/promises";
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
// TODO: add a timeout to the Go parser
|
// TODO: add a timeout to the Go parser
|
||||||
const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
|
const goExecutablePath = join(
|
||||||
|
process.cwd(),
|
||||||
|
"sharedLibs",
|
||||||
|
"go-html-to-md",
|
||||||
|
"html-to-markdown.so"
|
||||||
|
);
|
||||||
|
|
||||||
class GoMarkdownConverter {
|
class GoMarkdownConverter {
|
||||||
private static instance: GoMarkdownConverter;
|
private static instance: GoMarkdownConverter;
|
||||||
@@ -18,7 +22,7 @@ class GoMarkdownConverter {
|
|||||||
|
|
||||||
private constructor() {
|
private constructor() {
|
||||||
const lib = koffi.load(goExecutablePath);
|
const lib = koffi.load(goExecutablePath);
|
||||||
this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
|
this.convert = lib.func("ConvertHTMLToMarkdown", "string", ["string"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static async getInstance(): Promise<GoMarkdownConverter> {
|
public static async getInstance(): Promise<GoMarkdownConverter> {
|
||||||
@@ -46,9 +50,11 @@ class GoMarkdownConverter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function parseMarkdown(html: string | null | undefined): Promise<string> {
|
export async function parseMarkdown(
|
||||||
|
html: string | null | undefined
|
||||||
|
): Promise<string> {
|
||||||
if (!html) {
|
if (!html) {
|
||||||
return '';
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -62,17 +68,25 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
|
|||||||
return markdownContent;
|
return markdownContent;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (!(error instanceof Error) || error.message !== "Go shared library not found") {
|
if (
|
||||||
|
!(error instanceof Error) ||
|
||||||
|
error.message !== "Go shared library not found"
|
||||||
|
) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
|
logger.error(
|
||||||
|
`Error converting HTML to Markdown with Go parser: ${error}`
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
|
logger.warn(
|
||||||
|
"Tried to use Go parser, but it doesn't exist in the file system.",
|
||||||
|
{ goExecutablePath }
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to TurndownService if Go parser fails or is not enabled
|
// Fallback to TurndownService if Go parser fails or is not enabled
|
||||||
var TurndownService = require("turndown");
|
var TurndownService = require("turndown");
|
||||||
var turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
var turndownPluginGfm = require("joplin-turndown-plugin-gfm");
|
||||||
|
|
||||||
const turndownService = new TurndownService();
|
const turndownService = new TurndownService();
|
||||||
turndownService.addRule("inlineLink", {
|
turndownService.addRule("inlineLink", {
|
||||||
@@ -87,7 +101,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
|
|||||||
var href = node.getAttribute("href").trim();
|
var href = node.getAttribute("href").trim();
|
||||||
var title = node.title ? ' "' + node.title + '"' : "";
|
var title = node.title ? ' "' + node.title + '"' : "";
|
||||||
return "[" + content.trim() + "](" + href + title + ")\n";
|
return "[" + content.trim() + "](" + href + title + ")\n";
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
var gfm = turndownPluginGfm.gfm;
|
var gfm = turndownPluginGfm.gfm;
|
||||||
turndownService.use(gfm);
|
turndownService.use(gfm);
|
||||||
@@ -99,7 +113,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
|
|||||||
|
|
||||||
return markdownContent;
|
return markdownContent;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error("Error converting HTML to Markdown", {error});
|
logger.error("Error converting HTML to Markdown", { error });
|
||||||
return ""; // Optionally return an empty string or handle the error as needed
|
return ""; // Optionally return an empty string or handle the error as needed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ export async function deleteJobPriority(team_id, job_id) {
|
|||||||
export async function getJobPriority({
|
export async function getJobPriority({
|
||||||
plan,
|
plan,
|
||||||
team_id,
|
team_id,
|
||||||
basePriority = 10,
|
basePriority = 10
|
||||||
}: {
|
}: {
|
||||||
plan: PlanType | undefined;
|
plan: PlanType | undefined;
|
||||||
team_id: string;
|
team_id: string;
|
||||||
|
|||||||
+23
-15
@@ -3,24 +3,26 @@ import * as winston from "winston";
|
|||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
const logFormat = winston.format.printf(info =>
|
const logFormat = winston.format.printf(
|
||||||
`${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
|
(info) =>
|
||||||
info.metadata,
|
`${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${
|
||||||
(_, value) => {
|
info.level.includes("error") || info.level.includes("warn")
|
||||||
|
? JSON.stringify(info.metadata, (_, value) => {
|
||||||
if (value instanceof Error) {
|
if (value instanceof Error) {
|
||||||
return {
|
return {
|
||||||
...value,
|
...value,
|
||||||
name: value.name,
|
name: value.name,
|
||||||
message: value.message,
|
message: value.message,
|
||||||
stack: value.stack,
|
stack: value.stack,
|
||||||
cause: value.cause,
|
cause: value.cause
|
||||||
}
|
};
|
||||||
} else {
|
} else {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
) : ""}`
|
: ""
|
||||||
)
|
}`
|
||||||
|
);
|
||||||
|
|
||||||
export const logger = winston.createLogger({
|
export const logger = winston.createLogger({
|
||||||
level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
|
level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
|
||||||
@@ -32,8 +34,8 @@ export const logger = winston.createLogger({
|
|||||||
name: value.name,
|
name: value.name,
|
||||||
message: value.message,
|
message: value.message,
|
||||||
stack: value.stack,
|
stack: value.stack,
|
||||||
cause: value.cause,
|
cause: value.cause
|
||||||
}
|
};
|
||||||
} else {
|
} else {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
@@ -43,9 +45,15 @@ export const logger = winston.createLogger({
|
|||||||
new winston.transports.Console({
|
new winston.transports.Console({
|
||||||
format: winston.format.combine(
|
format: winston.format.combine(
|
||||||
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
|
||||||
winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
|
winston.format.metadata({
|
||||||
...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
|
fillExcept: ["message", "level", "timestamp"]
|
||||||
),
|
|
||||||
}),
|
}),
|
||||||
],
|
...((process.env.ENV === "production" &&
|
||||||
|
process.env.SENTRY_ENVIRONMENT === "dev") ||
|
||||||
|
process.env.ENV !== "production"
|
||||||
|
? [winston.format.colorize(), logFormat]
|
||||||
|
: [])
|
||||||
|
)
|
||||||
|
})
|
||||||
|
]
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ export function parseApi(api: string) {
|
|||||||
return uuid;
|
return uuid;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export function uuidToFcUuid(uuid: string) {
|
export function uuidToFcUuid(uuid: string) {
|
||||||
const uuidWithoutDashes = uuid.replace(/-/g, "");
|
const uuidWithoutDashes = uuid.replace(/-/g, "");
|
||||||
return `fc-${uuidWithoutDashes}`;
|
return `fc-${uuidWithoutDashes}`;
|
||||||
|
|||||||
@@ -1,20 +1,20 @@
|
|||||||
import { performRanking } from './ranker';
|
import { performRanking } from "./ranker";
|
||||||
|
|
||||||
describe('performRanking', () => {
|
describe("performRanking", () => {
|
||||||
it('should rank links based on similarity to search query', async () => {
|
it("should rank links based on similarity to search query", async () => {
|
||||||
const linksWithContext = [
|
const linksWithContext = [
|
||||||
'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds',
|
"url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds",
|
||||||
'url: https://example.com/cats, title: Cat care guide, description: Everything about cats',
|
"url: https://example.com/cats, title: Cat care guide, description: Everything about cats",
|
||||||
'url: https://example.com/pets, title: General pet care, description: Care for all types of pets'
|
"url: https://example.com/pets, title: General pet care, description: Care for all types of pets"
|
||||||
];
|
];
|
||||||
|
|
||||||
const links = [
|
const links = [
|
||||||
'https://example.com/dogs',
|
"https://example.com/dogs",
|
||||||
'https://example.com/cats',
|
"https://example.com/cats",
|
||||||
'https://example.com/pets'
|
"https://example.com/pets"
|
||||||
];
|
];
|
||||||
|
|
||||||
const searchQuery = 'cats training';
|
const searchQuery = "cats training";
|
||||||
|
|
||||||
const result = await performRanking(linksWithContext, links, searchQuery);
|
const result = await performRanking(linksWithContext, links, searchQuery);
|
||||||
|
|
||||||
@@ -23,42 +23,39 @@ describe('performRanking', () => {
|
|||||||
expect(result.length).toBe(3);
|
expect(result.length).toBe(3);
|
||||||
|
|
||||||
// First result should be the dogs page since query is about dogs
|
// First result should be the dogs page since query is about dogs
|
||||||
expect(result[0].link).toBe('https://example.com/cats');
|
expect(result[0].link).toBe("https://example.com/cats");
|
||||||
|
|
||||||
// Each result should have required properties
|
// Each result should have required properties
|
||||||
result.forEach(item => {
|
result.forEach((item) => {
|
||||||
expect(item).toHaveProperty('link');
|
expect(item).toHaveProperty("link");
|
||||||
expect(item).toHaveProperty('linkWithContext');
|
expect(item).toHaveProperty("linkWithContext");
|
||||||
expect(item).toHaveProperty('score');
|
expect(item).toHaveProperty("score");
|
||||||
expect(item).toHaveProperty('originalIndex');
|
expect(item).toHaveProperty("originalIndex");
|
||||||
expect(typeof item.score).toBe('number');
|
expect(typeof item.score).toBe("number");
|
||||||
expect(item.score).toBeGreaterThanOrEqual(0);
|
expect(item.score).toBeGreaterThanOrEqual(0);
|
||||||
expect(item.score).toBeLessThanOrEqual(1);
|
expect(item.score).toBeLessThanOrEqual(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
// Scores should be in descending order
|
// Scores should be in descending order
|
||||||
for (let i = 1; i < result.length; i++) {
|
for (let i = 1; i < result.length; i++) {
|
||||||
expect(result[i].score).toBeLessThanOrEqual(result[i-1].score);
|
expect(result[i].score).toBeLessThanOrEqual(result[i - 1].score);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should handle empty inputs', async () => {
|
it("should handle empty inputs", async () => {
|
||||||
const result = await performRanking([], [], '');
|
const result = await performRanking([], [], "");
|
||||||
expect(result).toEqual([]);
|
expect(result).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should maintain original order for equal scores', async () => {
|
it("should maintain original order for equal scores", async () => {
|
||||||
const linksWithContext = [
|
const linksWithContext = [
|
||||||
'url: https://example.com/1, title: Similar content A, description: test',
|
"url: https://example.com/1, title: Similar content A, description: test",
|
||||||
'url: https://example.com/2, title: Similar content B, description: test'
|
"url: https://example.com/2, title: Similar content B, description: test"
|
||||||
];
|
];
|
||||||
|
|
||||||
const links = [
|
const links = ["https://example.com/1", "https://example.com/2"];
|
||||||
'https://example.com/1',
|
|
||||||
'https://example.com/2'
|
|
||||||
];
|
|
||||||
|
|
||||||
const searchQuery = 'test';
|
const searchQuery = "test";
|
||||||
|
|
||||||
const result = await performRanking(linksWithContext, links, searchQuery);
|
const result = await performRanking(linksWithContext, links, searchQuery);
|
||||||
|
|
||||||
|
|||||||
+15
-13
@@ -1,18 +1,18 @@
|
|||||||
import axios from 'axios';
|
import axios from "axios";
|
||||||
import { configDotenv } from 'dotenv';
|
import { configDotenv } from "dotenv";
|
||||||
import OpenAI from "openai";
|
import OpenAI from "openai";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
const openai = new OpenAI({
|
const openai = new OpenAI({
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
apiKey: process.env.OPENAI_API_KEY
|
||||||
});
|
});
|
||||||
|
|
||||||
async function getEmbedding(text: string) {
|
async function getEmbedding(text: string) {
|
||||||
const embedding = await openai.embeddings.create({
|
const embedding = await openai.embeddings.create({
|
||||||
model: "text-embedding-ada-002",
|
model: "text-embedding-ada-002",
|
||||||
input: text,
|
input: text,
|
||||||
encoding_format: "float",
|
encoding_format: "float"
|
||||||
});
|
});
|
||||||
|
|
||||||
return embedding.data[0].embedding;
|
return embedding.data[0].embedding;
|
||||||
@@ -20,12 +20,8 @@ async function getEmbedding(text: string) {
|
|||||||
|
|
||||||
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
|
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
|
||||||
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
|
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
|
||||||
const magnitude1 = Math.sqrt(
|
const magnitude1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0));
|
||||||
vec1.reduce((sum, val) => sum + val * val, 0)
|
const magnitude2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0));
|
||||||
);
|
|
||||||
const magnitude2 = Math.sqrt(
|
|
||||||
vec2.reduce((sum, val) => sum + val * val, 0)
|
|
||||||
);
|
|
||||||
if (magnitude1 === 0 || magnitude2 === 0) return 0;
|
if (magnitude1 === 0 || magnitude2 === 0) return 0;
|
||||||
return dotProduct / (magnitude1 * magnitude2);
|
return dotProduct / (magnitude1 * magnitude2);
|
||||||
};
|
};
|
||||||
@@ -40,7 +36,11 @@ const textToVector = (searchQuery: string, text: string): number[] => {
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
|
async function performRanking(
|
||||||
|
linksWithContext: string[],
|
||||||
|
links: string[],
|
||||||
|
searchQuery: string
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
// Handle invalid inputs
|
// Handle invalid inputs
|
||||||
if (!searchQuery || !linksWithContext.length || !links.length) {
|
if (!searchQuery || !linksWithContext.length || !links.length) {
|
||||||
@@ -54,7 +54,8 @@ async function performRanking(linksWithContext: string[], links: string[], searc
|
|||||||
const queryEmbedding = await getEmbedding(sanitizedQuery);
|
const queryEmbedding = await getEmbedding(sanitizedQuery);
|
||||||
|
|
||||||
// Generate embeddings for each link and calculate similarity
|
// Generate embeddings for each link and calculate similarity
|
||||||
const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
|
const linksAndScores = await Promise.all(
|
||||||
|
linksWithContext.map(async (linkWithContext, index) => {
|
||||||
try {
|
try {
|
||||||
const linkEmbedding = await getEmbedding(linkWithContext);
|
const linkEmbedding = await getEmbedding(linkWithContext);
|
||||||
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
|
||||||
@@ -74,7 +75,8 @@ async function performRanking(linksWithContext: string[], links: string[], searc
|
|||||||
originalIndex: index
|
originalIndex: index
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}));
|
})
|
||||||
|
);
|
||||||
|
|
||||||
// Sort links based on similarity scores while preserving original order for equal scores
|
// Sort links based on similarity scores while preserving original order for equal scores
|
||||||
linksAndScores.sort((a, b) => {
|
linksAndScores.sort((a, b) => {
|
||||||
|
|||||||
@@ -6,47 +6,61 @@ import { Engine } from "../scraper/scrapeURL/engines";
|
|||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export type ScrapeErrorEvent = {
|
export type ScrapeErrorEvent = {
|
||||||
type: "error",
|
type: "error";
|
||||||
message: string,
|
message: string;
|
||||||
stack?: string,
|
stack?: string;
|
||||||
}
|
};
|
||||||
|
|
||||||
export type ScrapeScrapeEvent = {
|
export type ScrapeScrapeEvent = {
|
||||||
type: "scrape",
|
type: "scrape";
|
||||||
url: string,
|
url: string;
|
||||||
worker?: string,
|
worker?: string;
|
||||||
method: Engine,
|
method: Engine;
|
||||||
result: null | {
|
result: null | {
|
||||||
success: boolean,
|
success: boolean;
|
||||||
response_code?: number,
|
response_code?: number;
|
||||||
response_size?: number,
|
response_size?: number;
|
||||||
error?: string | object,
|
error?: string | object;
|
||||||
// proxy?: string,
|
// proxy?: string,
|
||||||
time_taken: number,
|
time_taken: number;
|
||||||
},
|
};
|
||||||
}
|
};
|
||||||
|
|
||||||
export type ScrapeQueueEvent = {
|
export type ScrapeQueueEvent = {
|
||||||
type: "queue",
|
type: "queue";
|
||||||
event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed",
|
event:
|
||||||
worker?: string,
|
| "waiting"
|
||||||
}
|
| "active"
|
||||||
|
| "completed"
|
||||||
|
| "paused"
|
||||||
|
| "resumed"
|
||||||
|
| "removed"
|
||||||
|
| "failed";
|
||||||
|
worker?: string;
|
||||||
|
};
|
||||||
|
|
||||||
export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent;
|
export type ScrapeEvent =
|
||||||
|
| ScrapeErrorEvent
|
||||||
|
| ScrapeScrapeEvent
|
||||||
|
| ScrapeQueueEvent;
|
||||||
|
|
||||||
export class ScrapeEvents {
|
export class ScrapeEvents {
|
||||||
static async insert(jobId: string, content: ScrapeEvent) {
|
static async insert(jobId: string, content: ScrapeEvent) {
|
||||||
if (jobId === "TEST") return null;
|
if (jobId === "TEST") return null;
|
||||||
|
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
if (useDbAuthentication) {
|
if (useDbAuthentication) {
|
||||||
try {
|
try {
|
||||||
const result = await supabase.from("scrape_events").insert({
|
const result = await supabase
|
||||||
|
.from("scrape_events")
|
||||||
|
.insert({
|
||||||
job_id: jobId,
|
job_id: jobId,
|
||||||
type: content.type,
|
type: content.type,
|
||||||
content: content,
|
content: content
|
||||||
// created_at
|
// created_at
|
||||||
}).select().single();
|
})
|
||||||
|
.select()
|
||||||
|
.single();
|
||||||
return (result.data as any).id;
|
return (result.data as any).id;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// logger.error(`Error inserting scrape event: ${error}`);
|
// logger.error(`Error inserting scrape event: ${error}`);
|
||||||
@@ -57,17 +71,25 @@ export class ScrapeEvents {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) {
|
static async updateScrapeResult(
|
||||||
|
logId: number | null,
|
||||||
|
result: ScrapeScrapeEvent["result"]
|
||||||
|
) {
|
||||||
if (logId === null) return;
|
if (logId === null) return;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any;
|
const previousLog = (
|
||||||
await supabase.from("scrape_events").update({
|
await supabase.from("scrape_events").select().eq("id", logId).single()
|
||||||
|
).data as any;
|
||||||
|
await supabase
|
||||||
|
.from("scrape_events")
|
||||||
|
.update({
|
||||||
content: {
|
content: {
|
||||||
...previousLog.content,
|
...previousLog.content,
|
||||||
result,
|
result
|
||||||
}
|
}
|
||||||
}).eq("id", logId);
|
})
|
||||||
|
.eq("id", logId);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error updating scrape result: ${error}`);
|
logger.error(`Error updating scrape result: ${error}`);
|
||||||
}
|
}
|
||||||
@@ -78,7 +100,7 @@ export class ScrapeEvents {
|
|||||||
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
await this.insert(((job as any).id ? (job as any).id : job) as string, {
|
||||||
type: "queue",
|
type: "queue",
|
||||||
event,
|
event,
|
||||||
worker: process.env.FLY_MACHINE_ID,
|
worker: process.env.FLY_MACHINE_ID
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error logging job event: ${error}`);
|
logger.error(`Error logging job event: ${error}`);
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
|
|||||||
const { data, error } = await supabase_service
|
const { data, error } = await supabase_service
|
||||||
.from("firecrawl_jobs")
|
.from("firecrawl_jobs")
|
||||||
.select()
|
.select()
|
||||||
.eq("crawl_id", crawlId)
|
.eq("crawl_id", crawlId);
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
|
||||||
@@ -73,7 +73,6 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
|
|||||||
return data;
|
return data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
|
export const supabaseGetJobByIdOnlyData = async (jobId: string) => {
|
||||||
const { data, error } = await supabase_service
|
const { data, error } = await supabase_service
|
||||||
.from("firecrawl_jobs")
|
.from("firecrawl_jobs")
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -18,7 +18,10 @@ describe("isSameDomain", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should return true for a subdomain with different protocols", () => {
|
it("should return true for a subdomain with different protocols", () => {
|
||||||
const result = isSameDomain("https://sub.example.com", "http://example.com");
|
const result = isSameDomain(
|
||||||
|
"https://sub.example.com",
|
||||||
|
"http://example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -30,32 +33,44 @@ describe("isSameDomain", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should return true for a subdomain with www prefix", () => {
|
it("should return true for a subdomain with www prefix", () => {
|
||||||
const result = isSameDomain("http://www.sub.example.com", "http://example.com");
|
const result = isSameDomain(
|
||||||
|
"http://www.sub.example.com",
|
||||||
|
"http://example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return true for the same domain with www prefix", () => {
|
it("should return true for the same domain with www prefix", () => {
|
||||||
const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
|
const result = isSameDomain(
|
||||||
|
"http://docs.s.s.example.com",
|
||||||
|
"http://example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
describe("isSameSubdomain", () => {
|
describe("isSameSubdomain", () => {
|
||||||
it("should return false for a subdomain", () => {
|
it("should return false for a subdomain", () => {
|
||||||
const result = isSameSubdomain("http://example.com", "http://docs.example.com");
|
const result = isSameSubdomain(
|
||||||
|
"http://example.com",
|
||||||
|
"http://docs.example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(false);
|
expect(result).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return true for the same subdomain", () => {
|
it("should return true for the same subdomain", () => {
|
||||||
const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
|
const result = isSameSubdomain(
|
||||||
|
"http://docs.example.com",
|
||||||
|
"http://docs.example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return false for different subdomains", () => {
|
it("should return false for different subdomains", () => {
|
||||||
const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
|
const result = isSameSubdomain(
|
||||||
|
"http://docs.example.com",
|
||||||
|
"http://blog.example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(false);
|
expect(result).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -72,17 +87,26 @@ describe("isSameSubdomain", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should return true for the same subdomain with different protocols", () => {
|
it("should return true for the same subdomain with different protocols", () => {
|
||||||
const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
|
const result = isSameSubdomain(
|
||||||
|
"https://docs.example.com",
|
||||||
|
"http://docs.example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return true for the same subdomain with www prefix", () => {
|
it("should return true for the same subdomain with www prefix", () => {
|
||||||
const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
|
const result = isSameSubdomain(
|
||||||
|
"http://www.docs.example.com",
|
||||||
|
"http://docs.example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(true);
|
expect(result).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
||||||
const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
|
const result = isSameSubdomain(
|
||||||
|
"http://www.docs.example.com",
|
||||||
|
"http://blog.example.com"
|
||||||
|
);
|
||||||
expect(result).toBe(false);
|
expect(result).toBe(false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -116,19 +140,13 @@ describe("removeDuplicateUrls", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should prefer https over http", () => {
|
it("should prefer https over http", () => {
|
||||||
const urls = [
|
const urls = ["http://example.com", "https://example.com"];
|
||||||
"http://example.com",
|
|
||||||
"https://example.com"
|
|
||||||
];
|
|
||||||
const result = removeDuplicateUrls(urls);
|
const result = removeDuplicateUrls(urls);
|
||||||
expect(result).toEqual(["https://example.com"]);
|
expect(result).toEqual(["https://example.com"]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should prefer non-www over www", () => {
|
it("should prefer non-www over www", () => {
|
||||||
const urls = [
|
const urls = ["https://www.example.com", "https://example.com"];
|
||||||
"https://www.example.com",
|
|
||||||
"https://example.com"
|
|
||||||
];
|
|
||||||
const result = removeDuplicateUrls(urls);
|
const result = removeDuplicateUrls(urls);
|
||||||
expect(result).toEqual(["https://example.com"]);
|
expect(result).toEqual(["https://example.com"]);
|
||||||
});
|
});
|
||||||
@@ -140,19 +158,13 @@ describe("removeDuplicateUrls", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("should handle URLs with different cases", () => {
|
it("should handle URLs with different cases", () => {
|
||||||
const urls = [
|
const urls = ["https://EXAMPLE.com", "https://example.com"];
|
||||||
"https://EXAMPLE.com",
|
|
||||||
"https://example.com"
|
|
||||||
];
|
|
||||||
const result = removeDuplicateUrls(urls);
|
const result = removeDuplicateUrls(urls);
|
||||||
expect(result).toEqual(["https://EXAMPLE.com"]);
|
expect(result).toEqual(["https://EXAMPLE.com"]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should handle URLs with trailing slashes", () => {
|
it("should handle URLs with trailing slashes", () => {
|
||||||
const urls = [
|
const urls = ["https://example.com", "https://example.com/"];
|
||||||
"https://example.com",
|
|
||||||
"https://example.com/"
|
|
||||||
];
|
|
||||||
const result = removeDuplicateUrls(urls);
|
const result = removeDuplicateUrls(urls);
|
||||||
expect(result).toEqual(["https://example.com"]);
|
expect(result).toEqual(["https://example.com"]);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -74,16 +74,21 @@ export function isSameDomain(url: string, baseUrl: string) {
|
|||||||
const typedUrlObj2 = urlObj2 as URL;
|
const typedUrlObj2 = urlObj2 as URL;
|
||||||
|
|
||||||
const cleanHostname = (hostname: string) => {
|
const cleanHostname = (hostname: string) => {
|
||||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
|
||||||
};
|
};
|
||||||
|
|
||||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
const domain1 = cleanHostname(typedUrlObj1.hostname)
|
||||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
.split(".")
|
||||||
|
.slice(-2)
|
||||||
|
.join(".");
|
||||||
|
const domain2 = cleanHostname(typedUrlObj2.hostname)
|
||||||
|
.split(".")
|
||||||
|
.slice(-2)
|
||||||
|
.join(".");
|
||||||
|
|
||||||
return domain1 === domain2;
|
return domain1 === domain2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export function isSameSubdomain(url: string, baseUrl: string) {
|
export function isSameSubdomain(url: string, baseUrl: string) {
|
||||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||||
@@ -96,20 +101,31 @@ export function isSameSubdomain(url: string, baseUrl: string) {
|
|||||||
const typedUrlObj2 = urlObj2 as URL;
|
const typedUrlObj2 = urlObj2 as URL;
|
||||||
|
|
||||||
const cleanHostname = (hostname: string) => {
|
const cleanHostname = (hostname: string) => {
|
||||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
|
||||||
};
|
};
|
||||||
|
|
||||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
const domain1 = cleanHostname(typedUrlObj1.hostname)
|
||||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
.split(".")
|
||||||
|
.slice(-2)
|
||||||
|
.join(".");
|
||||||
|
const domain2 = cleanHostname(typedUrlObj2.hostname)
|
||||||
|
.split(".")
|
||||||
|
.slice(-2)
|
||||||
|
.join(".");
|
||||||
|
|
||||||
const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.');
|
const subdomain1 = cleanHostname(typedUrlObj1.hostname)
|
||||||
const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.');
|
.split(".")
|
||||||
|
.slice(0, -2)
|
||||||
|
.join(".");
|
||||||
|
const subdomain2 = cleanHostname(typedUrlObj2.hostname)
|
||||||
|
.split(".")
|
||||||
|
.slice(0, -2)
|
||||||
|
.join(".");
|
||||||
|
|
||||||
// Check if the domains are the same and the subdomains are the same
|
// Check if the domains are the same and the subdomains are the same
|
||||||
return domain1 === domain2 && subdomain1 === subdomain2;
|
return domain1 === domain2 && subdomain1 === subdomain2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export const checkAndUpdateURLForMap = (url: string) => {
|
export const checkAndUpdateURLForMap = (url: string) => {
|
||||||
if (!protocolIncluded(url)) {
|
if (!protocolIncluded(url)) {
|
||||||
url = `http://${url}`;
|
url = `http://${url}`;
|
||||||
@@ -119,7 +135,6 @@ export const checkAndUpdateURLForMap = (url: string) => {
|
|||||||
url = url.slice(0, -1);
|
url = url.slice(0, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const { error, urlObj } = getURLobj(url);
|
const { error, urlObj } = getURLobj(url);
|
||||||
if (error) {
|
if (error) {
|
||||||
throw new Error("Invalid URL");
|
throw new Error("Invalid URL");
|
||||||
@@ -137,17 +152,13 @@ export const checkAndUpdateURLForMap = (url: string) => {
|
|||||||
return { urlObj: typedUrlObj, url: url };
|
return { urlObj: typedUrlObj, url: url };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export function removeDuplicateUrls(urls: string[]): string[] {
|
export function removeDuplicateUrls(urls: string[]): string[] {
|
||||||
const urlMap = new Map<string, string>();
|
const urlMap = new Map<string, string>();
|
||||||
|
|
||||||
for (const url of urls) {
|
for (const url of urls) {
|
||||||
const parsedUrl = new URL(url);
|
const parsedUrl = new URL(url);
|
||||||
const protocol = parsedUrl.protocol;
|
const protocol = parsedUrl.protocol;
|
||||||
const hostname = parsedUrl.hostname.replace(/^www\./, '');
|
const hostname = parsedUrl.hostname.replace(/^www\./, "");
|
||||||
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
|
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
|
||||||
|
|
||||||
const key = `${hostname}${path}`;
|
const key = `${hostname}${path}`;
|
||||||
@@ -158,9 +169,13 @@ export function removeDuplicateUrls(urls: string[]): string[] {
|
|||||||
const existingUrl = new URL(urlMap.get(key)!);
|
const existingUrl = new URL(urlMap.get(key)!);
|
||||||
const existingProtocol = existingUrl.protocol;
|
const existingProtocol = existingUrl.protocol;
|
||||||
|
|
||||||
if (protocol === 'https:' && existingProtocol === 'http:') {
|
if (protocol === "https:" && existingProtocol === "http:") {
|
||||||
urlMap.set(key, url);
|
urlMap.set(key, url);
|
||||||
} else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
|
} else if (
|
||||||
|
protocol === existingProtocol &&
|
||||||
|
!parsedUrl.hostname.startsWith("www.") &&
|
||||||
|
existingUrl.hostname.startsWith("www.")
|
||||||
|
) {
|
||||||
urlMap.set(key, url);
|
urlMap.set(key, url);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,10 +8,10 @@ let warningCount = 0;
|
|||||||
|
|
||||||
export function withAuth<T, U extends any[]>(
|
export function withAuth<T, U extends any[]>(
|
||||||
originalFunction: (...args: U) => Promise<T>,
|
originalFunction: (...args: U) => Promise<T>,
|
||||||
mockSuccess: T,
|
mockSuccess: T
|
||||||
) {
|
) {
|
||||||
return async function (...args: U): Promise<T> {
|
return async function (...args: U): Promise<T> {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
if (!useDbAuthentication) {
|
if (!useDbAuthentication) {
|
||||||
if (warningCount < 5) {
|
if (warningCount < 5) {
|
||||||
logger.warn("You're bypassing authentication");
|
logger.warn("You're bypassing authentication");
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { Job } from "bullmq";
|
|||||||
import {
|
import {
|
||||||
WebScraperOptions,
|
WebScraperOptions,
|
||||||
RunWebScraperParams,
|
RunWebScraperParams,
|
||||||
RunWebScraperResult,
|
RunWebScraperResult
|
||||||
} from "../types";
|
} from "../types";
|
||||||
import { billTeam } from "../services/billing/credit_billing";
|
import { billTeam } from "../services/billing/credit_billing";
|
||||||
import { Document } from "../controllers/v1/types";
|
import { Document } from "../controllers/v1/types";
|
||||||
@@ -10,25 +10,31 @@ import { supabase_service } from "../services/supabase";
|
|||||||
import { logger } from "../lib/logger";
|
import { logger } from "../lib/logger";
|
||||||
import { ScrapeEvents } from "../lib/scrape-events";
|
import { ScrapeEvents } from "../lib/scrape-events";
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
|
import {
|
||||||
|
EngineResultsTracker,
|
||||||
|
scrapeURL,
|
||||||
|
ScrapeUrlResponse
|
||||||
|
} from "../scraper/scrapeURL";
|
||||||
import { Engine } from "../scraper/scrapeURL/engines";
|
import { Engine } from "../scraper/scrapeURL/engines";
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
export async function startWebScraperPipeline({
|
export async function startWebScraperPipeline({
|
||||||
job,
|
job,
|
||||||
token,
|
token
|
||||||
}: {
|
}: {
|
||||||
job: Job<WebScraperOptions> & { id: string };
|
job: Job<WebScraperOptions> & { id: string };
|
||||||
token: string;
|
token: string;
|
||||||
}) {
|
}) {
|
||||||
return (await runWebScraper({
|
return await runWebScraper({
|
||||||
url: job.data.url,
|
url: job.data.url,
|
||||||
mode: job.data.mode,
|
mode: job.data.mode,
|
||||||
scrapeOptions: {
|
scrapeOptions: {
|
||||||
...job.data.scrapeOptions,
|
...job.data.scrapeOptions,
|
||||||
...(job.data.crawl_id ? ({
|
...(job.data.crawl_id
|
||||||
formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
|
? {
|
||||||
}): {}),
|
formats: job.data.scrapeOptions.formats.concat(["rawHtml"])
|
||||||
|
}
|
||||||
|
: {})
|
||||||
},
|
},
|
||||||
internalOptions: job.data.internalOptions,
|
internalOptions: job.data.internalOptions,
|
||||||
// onSuccess: (result, mode) => {
|
// onSuccess: (result, mode) => {
|
||||||
@@ -42,8 +48,8 @@ export async function startWebScraperPipeline({
|
|||||||
team_id: job.data.team_id,
|
team_id: job.data.team_id,
|
||||||
bull_job_id: job.id.toString(),
|
bull_job_id: job.id.toString(),
|
||||||
priority: job.opts.priority,
|
priority: job.opts.priority,
|
||||||
is_scrape: job.data.is_scrape ?? false,
|
is_scrape: job.data.is_scrape ?? false
|
||||||
}));
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function runWebScraper({
|
export async function runWebScraper({
|
||||||
@@ -56,28 +62,40 @@ export async function runWebScraper({
|
|||||||
team_id,
|
team_id,
|
||||||
bull_job_id,
|
bull_job_id,
|
||||||
priority,
|
priority,
|
||||||
is_scrape=false,
|
is_scrape = false
|
||||||
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
|
||||||
let response: ScrapeUrlResponse | undefined = undefined;
|
let response: ScrapeUrlResponse | undefined = undefined;
|
||||||
let engines: EngineResultsTracker = {};
|
let engines: EngineResultsTracker = {};
|
||||||
try {
|
try {
|
||||||
response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
|
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
|
||||||
|
priority,
|
||||||
|
...internalOptions
|
||||||
|
});
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
if (response.error instanceof Error) {
|
if (response.error instanceof Error) {
|
||||||
throw response.error;
|
throw response.error;
|
||||||
} else {
|
} else {
|
||||||
throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
|
throw new Error(
|
||||||
|
"scrapeURL error: " +
|
||||||
|
(Array.isArray(response.error)
|
||||||
|
? JSON.stringify(response.error)
|
||||||
|
: typeof response.error === "object"
|
||||||
|
? JSON.stringify({ ...response.error })
|
||||||
|
: response.error)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(is_scrape === false) {
|
if (is_scrape === false) {
|
||||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||||
if (scrapeOptions.extract) {
|
if (scrapeOptions.extract) {
|
||||||
creditsToBeBilled = 5;
|
creditsToBeBilled = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
|
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
|
||||||
logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
|
logger.error(
|
||||||
|
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`
|
||||||
|
);
|
||||||
// Optionally, you could notify an admin or add to a retry queue here
|
// Optionally, you could notify an admin or add to a retry queue here
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -88,42 +106,70 @@ export async function runWebScraper({
|
|||||||
engines = response.engines;
|
engines = response.engines;
|
||||||
return response;
|
return response;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
|
engines =
|
||||||
|
response !== undefined
|
||||||
|
? response.engines
|
||||||
|
: typeof error === "object" && error !== null
|
||||||
|
? ((error as any).results ?? {})
|
||||||
|
: {};
|
||||||
|
|
||||||
if (response !== undefined) {
|
if (response !== undefined) {
|
||||||
return {
|
return {
|
||||||
...response,
|
...response,
|
||||||
success: false,
|
success: false,
|
||||||
error,
|
error
|
||||||
}
|
};
|
||||||
} else {
|
} else {
|
||||||
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
|
return {
|
||||||
|
success: false,
|
||||||
|
error,
|
||||||
|
logs: ["no logs -- error coming from runWebScraper"],
|
||||||
|
engines
|
||||||
|
};
|
||||||
}
|
}
|
||||||
// onError(error);
|
// onError(error);
|
||||||
} finally {
|
} finally {
|
||||||
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
|
const engineOrder = Object.entries(engines)
|
||||||
|
.sort((a, b) => a[1].startedAt - b[1].startedAt)
|
||||||
|
.map((x) => x[0]) as Engine[];
|
||||||
|
|
||||||
for (const engine of engineOrder) {
|
for (const engine of engineOrder) {
|
||||||
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
|
const result = engines[engine] as Exclude<
|
||||||
|
EngineResultsTracker[Engine],
|
||||||
|
undefined
|
||||||
|
>;
|
||||||
ScrapeEvents.insert(bull_job_id, {
|
ScrapeEvents.insert(bull_job_id, {
|
||||||
type: "scrape",
|
type: "scrape",
|
||||||
url,
|
url,
|
||||||
method: engine,
|
method: engine,
|
||||||
result: {
|
result: {
|
||||||
success: result.state === "success",
|
success: result.state === "success",
|
||||||
response_code: (result.state === "success" ? result.result.statusCode : undefined),
|
response_code:
|
||||||
response_size: (result.state === "success" ? result.result.html.length : undefined),
|
result.state === "success" ? result.result.statusCode : undefined,
|
||||||
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
|
response_size:
|
||||||
time_taken: result.finishedAt - result.startedAt,
|
result.state === "success" ? result.result.html.length : undefined,
|
||||||
},
|
error:
|
||||||
|
result.state === "error"
|
||||||
|
? result.error
|
||||||
|
: result.state === "timeout"
|
||||||
|
? "Timed out"
|
||||||
|
: undefined,
|
||||||
|
time_taken: result.finishedAt - result.startedAt
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
|
const saveJob = async (
|
||||||
|
job: Job,
|
||||||
|
result: any,
|
||||||
|
token: string,
|
||||||
|
mode: string,
|
||||||
|
engines?: EngineResultsTracker
|
||||||
|
) => {
|
||||||
try {
|
try {
|
||||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
|
||||||
if (useDbAuthentication) {
|
if (useDbAuthentication) {
|
||||||
const { data, error } = await supabase_service
|
const { data, error } = await supabase_service
|
||||||
.from("firecrawl_jobs")
|
.from("firecrawl_jobs")
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import {
|
|||||||
autoscalerController,
|
autoscalerController,
|
||||||
checkQueuesController,
|
checkQueuesController,
|
||||||
cleanBefore24hCompleteJobsController,
|
cleanBefore24hCompleteJobsController,
|
||||||
queuesController,
|
queuesController
|
||||||
} from "../controllers/v0/admin/queue";
|
} from "../controllers/v0/admin/queue";
|
||||||
import { wrap } from "./v1";
|
import { wrap } from "./v1";
|
||||||
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
|
||||||
@@ -26,10 +26,7 @@ adminRouter.get(
|
|||||||
checkQueuesController
|
checkQueuesController
|
||||||
);
|
);
|
||||||
|
|
||||||
adminRouter.get(
|
adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController);
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/queues`,
|
|
||||||
queuesController
|
|
||||||
);
|
|
||||||
|
|
||||||
adminRouter.get(
|
adminRouter.get(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
|
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
|
||||||
@@ -38,5 +35,5 @@ adminRouter.get(
|
|||||||
|
|
||||||
adminRouter.post(
|
adminRouter.post(
|
||||||
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
|
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
|
||||||
wrap(acucCacheClearController),
|
wrap(acucCacheClearController)
|
||||||
);
|
);
|
||||||
|
|||||||
+60
-40
@@ -4,7 +4,12 @@ import { crawlController } from "../controllers/v1/crawl";
|
|||||||
import { scrapeController } from "../../src/controllers/v1/scrape";
|
import { scrapeController } from "../../src/controllers/v1/scrape";
|
||||||
import { crawlStatusController } from "../controllers/v1/crawl-status";
|
import { crawlStatusController } from "../controllers/v1/crawl-status";
|
||||||
import { mapController } from "../controllers/v1/map";
|
import { mapController } from "../controllers/v1/map";
|
||||||
import { ErrorResponse, RequestWithACUC, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
|
import {
|
||||||
|
ErrorResponse,
|
||||||
|
RequestWithACUC,
|
||||||
|
RequestWithAuth,
|
||||||
|
RequestWithMaybeAuth
|
||||||
|
} from "../controllers/v1/types";
|
||||||
import { RateLimiterMode } from "../types";
|
import { RateLimiterMode } from "../types";
|
||||||
import { authenticateUser } from "../controllers/auth";
|
import { authenticateUser } from "../controllers/auth";
|
||||||
import { createIdempotencyKey } from "../services/idempotency/create";
|
import { createIdempotencyKey } from "../services/idempotency/create";
|
||||||
@@ -27,41 +32,55 @@ import { extractController } from "../controllers/v1/extract";
|
|||||||
// import { livenessController } from "../controllers/v1/liveness";
|
// import { livenessController } from "../controllers/v1/liveness";
|
||||||
// import { readinessController } from "../controllers/v1/readiness";
|
// import { readinessController } from "../controllers/v1/readiness";
|
||||||
|
|
||||||
/**
 * Builds an Express middleware that rejects a request with HTTP 402 when the
 * authenticated team does not have enough credits for it.
 *
 * @param minimum - Fixed number of credits the route requires. When omitted
 *   (or 0) the requirement is derived per request from `req.body.limit`, then
 *   `req.body.urls.length`, falling back to 1.
 * @returns A middleware that stores the chunk returned by `checkTeamCredits`
 *   (when there is one) on `req.acuc`, sets `req.account.remainingCredits`
 *   and calls `next()`. Any error thrown or rejected along the way is
 *   forwarded to `next(err)`.
 */
function checkCreditsMiddleware(
  minimum?: number
): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
  return (req, res, next) => {
    (async () => {
      // Resolve the requirement into a per-request local. Assigning to the
      // captured `minimum` parameter would make the first request's
      // body-derived value stick for every later request on the same route.
      let required = minimum;
      if (!required && req.body) {
        required =
          (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
      }

      const { success, remainingCredits, chunk } = await checkTeamCredits(
        req.acuc,
        req.auth.team_id,
        required ?? 1
      );
      if (chunk) {
        req.acuc = chunk;
      }

      if (!success) {
        logger.error(
          `Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum: required, remainingCredits })}`
        );
        if (!res.headersSent) {
          res.status(402).json({
            success: false,
            error:
              "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value."
          });
        }
        // Never hand an under-funded request to the next handler, even when a
        // response has already been started and the 402 cannot be written.
        return;
      }

      req.account = { remainingCredits };
      next();
    })().catch((err) => next(err));
  };
}
|
||||||
|
|
||||||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
export function authMiddleware(
|
||||||
|
rateLimiterMode: RateLimiterMode
|
||||||
|
): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||||
return (req, res, next) => {
|
return (req, res, next) => {
|
||||||
(async () => {
|
(async () => {
|
||||||
const auth = await authenticateUser(
|
const auth = await authenticateUser(req, res, rateLimiterMode);
|
||||||
req,
|
|
||||||
res,
|
|
||||||
rateLimiterMode,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!auth.success) {
|
if (!auth.success) {
|
||||||
if (!res.headersSent) {
|
if (!res.headersSent) {
|
||||||
return res.status(auth.status).json({ success: false, error: auth.error });
|
return res
|
||||||
|
.status(auth.status)
|
||||||
|
.json({ success: false, error: auth.error });
|
||||||
} else {
|
} else {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -75,41 +94,52 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW
|
|||||||
req.account = { remainingCredits: chunk.remaining_credits };
|
req.account = { remainingCredits: chunk.remaining_credits };
|
||||||
}
|
}
|
||||||
next();
|
next();
|
||||||
})()
|
})().catch((err) => next(err));
|
||||||
.catch(err => next(err));
|
};
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Express middleware enforcing the optional `x-idempotency-key` header.
 *
 * Requests without the header pass straight through. When the header is
 * present the key is checked with `validateIdempotencyKey`; a rejected key
 * ends the request with HTTP 409, an accepted key is recorded with
 * `createIdempotencyKey` before the chain continues.
 *
 * @param req - Incoming request; only `req.headers["x-idempotency-key"]` is read here.
 * @param res - Response used to send the 409 when the key is rejected.
 * @param next - Called with no argument to continue, or with the error when
 *   validation or key creation throws/rejects.
 */
function idempotencyMiddleware(
  req: Request,
  res: Response,
  next: NextFunction
) {
  (async () => {
    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        if (!res.headersSent) {
          res
            .status(409)
            .json({ success: false, error: "Idempotency key already used" });
        }
        // A rejected key must stop the chain even if the 409 could not be
        // written; previously this fell through and re-registered the key.
        return;
      }
      // Awaited so that, if createIdempotencyKey returns a promise, a
      // rejection reaches the .catch below instead of floating unhandled.
      await createIdempotencyKey(req);
    }
    next();
  })().catch((err) => next(err));
}
|
||||||
|
|
||||||
/**
 * Express middleware that refuses requests whose `body.url` is a string for
 * which `isUrlBlocked` returns true, answering with HTTP 403.
 *
 * @param req - Incoming request; only `req.body.url` is inspected.
 * @param res - Response used to send the 403 for blocked URLs.
 * @param next - Called when the request carries no string URL or the URL is
 *   not blocked.
 */
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
  // Optional chaining: a request without a parsed body must not throw here.
  const url: unknown = req.body?.url;

  if (typeof url === "string" && isUrlBlocked(url)) {
    if (!res.headersSent) {
      res.status(403).json({
        success: false,
        error:
          "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions."
      });
    }
    // Blocked requests never continue down the chain, even when a response
    // was already started and the 403 cannot be written.
    return;
  }

  next();
}
|
||||||
|
|
||||||
/**
 * Adapts a promise-returning controller to an Express request handler so that
 * failures end up in Express error handling via `next(err)`.
 *
 * @param controller - Handler invoked synchronously with `(req, res)`.
 * @returns A request handler that forwards a rejected promise, and also a
 *   synchronous throw from `controller`, to `next`.
 */
export function wrap(
  controller: (req: Request, res: Response) => Promise<any>
): (req: Request, res: Response, next: NextFunction) => any {
  return (req, res, next) => {
    try {
      // Promise.resolve tolerates a controller that (despite its type, e.g.
      // via an `any` cast at the call site) returns a plain value.
      Promise.resolve(controller(req, res)).catch((err) => next(err));
    } catch (err) {
      // The controller threw before producing a promise.
      next(err);
    }
  };
}
|
||||||
|
|
||||||
expressWs(express());
|
expressWs(express());
|
||||||
@@ -160,13 +190,10 @@ v1Router.get(
|
|||||||
"/batch/scrape/:jobId",
|
"/batch/scrape/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
// Yes, it uses the same controller as the normal crawl status controller
|
// Yes, it uses the same controller as the normal crawl status controller
|
||||||
wrap((req:any, res):any => crawlStatusController(req, res, true))
|
wrap((req: any, res): any => crawlStatusController(req, res, true))
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.get(
|
v1Router.get("/scrape/:jobId", wrap(scrapeStatusController));
|
||||||
"/scrape/:jobId",
|
|
||||||
wrap(scrapeStatusController)
|
|
||||||
);
|
|
||||||
|
|
||||||
v1Router.get(
|
v1Router.get(
|
||||||
"/concurrency-check",
|
"/concurrency-check",
|
||||||
@@ -174,10 +201,7 @@ v1Router.get(
|
|||||||
wrap(concurrencyCheckController)
|
wrap(concurrencyCheckController)
|
||||||
);
|
);
|
||||||
|
|
||||||
v1Router.ws(
|
v1Router.ws("/crawl/:jobId", crawlStatusWSController);
|
||||||
"/crawl/:jobId",
|
|
||||||
crawlStatusWSController
|
|
||||||
);
|
|
||||||
|
|
||||||
v1Router.post(
|
v1Router.post(
|
||||||
"/extract",
|
"/extract",
|
||||||
@@ -186,11 +210,8 @@ v1Router.post(
|
|||||||
wrap(extractController)
|
wrap(extractController)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||||
|
|
||||||
|
|
||||||
v1Router.delete(
|
v1Router.delete(
|
||||||
"/crawl/:jobId",
|
"/crawl/:jobId",
|
||||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||||
@@ -207,4 +228,3 @@ v1Router.delete(
|
|||||||
// Health/Probe routes
|
// Health/Probe routes
|
||||||
// v1Router.get("/health/liveness", livenessController);
|
// v1Router.get("/health/liveness", livenessController);
|
||||||
// v1Router.get("/health/readiness", readinessController);
|
// v1Router.get("/health/readiness", readinessController);
|
||||||
|
|
||||||
|
|||||||
@@ -18,19 +18,19 @@ async function sendCrawl(result: Result): Promise<string | undefined> {
|
|||||||
{
|
{
|
||||||
url: url,
|
url: url,
|
||||||
crawlerOptions: {
|
crawlerOptions: {
|
||||||
limit: 75,
|
limit: 75
|
||||||
},
|
},
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
includeHtml: true,
|
includeHtml: true,
|
||||||
replaceAllPathsWithAbsolutePaths: true,
|
replaceAllPathsWithAbsolutePaths: true,
|
||||||
waitFor: 1000,
|
waitFor: 1000
|
||||||
},
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
Authorization: `Bearer `,
|
Authorization: `Bearer `
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
result.idempotency_key = idempotencyKey;
|
result.idempotency_key = idempotencyKey;
|
||||||
@@ -51,8 +51,8 @@ async function getContent(result: Result): Promise<boolean> {
|
|||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
Authorization: `Bearer `,
|
Authorization: `Bearer `
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
if (response.data.status === "completed") {
|
if (response.data.status === "completed") {
|
||||||
@@ -95,9 +95,9 @@ async function processResults(results: Result[]): Promise<void> {
|
|||||||
// Save the result to the file
|
// Save the result to the file
|
||||||
try {
|
try {
|
||||||
// Save job id along with the start_url
|
// Save job id along with the start_url
|
||||||
const resultWithJobId = results.map(r => ({
|
const resultWithJobId = results.map((r) => ({
|
||||||
start_url: r.start_url,
|
start_url: r.start_url,
|
||||||
job_id: r.job_id,
|
job_id: r.job_id
|
||||||
}));
|
}));
|
||||||
await fs.writeFile(
|
await fs.writeFile(
|
||||||
"results_with_job_id_4000_6000.json",
|
"results_with_job_id_4000_6000.json",
|
||||||
|
|||||||
@@ -1,27 +1,29 @@
|
|||||||
// crawler.test.ts
|
// crawler.test.ts
|
||||||
import { WebCrawler } from '../crawler';
|
import { WebCrawler } from "../crawler";
|
||||||
import axios from 'axios';
|
import axios from "axios";
|
||||||
import robotsParser from 'robots-parser';
|
import robotsParser from "robots-parser";
|
||||||
|
|
||||||
jest.mock('axios');
|
jest.mock("axios");
|
||||||
jest.mock('robots-parser');
|
jest.mock("robots-parser");
|
||||||
|
|
||||||
describe('WebCrawler', () => {
|
describe("WebCrawler", () => {
|
||||||
let crawler: WebCrawler;
|
let crawler: WebCrawler;
|
||||||
const mockAxios = axios as jest.Mocked<typeof axios>;
|
const mockAxios = axios as jest.Mocked<typeof axios>;
|
||||||
const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;
|
const mockRobotsParser = robotsParser as jest.MockedFunction<
|
||||||
|
typeof robotsParser
|
||||||
|
>;
|
||||||
|
|
||||||
let maxCrawledDepth: number;
|
let maxCrawledDepth: number;
|
||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
// Setup default mocks
|
// Setup default mocks
|
||||||
mockAxios.get.mockImplementation((url) => {
|
mockAxios.get.mockImplementation((url) => {
|
||||||
if (url.includes('robots.txt')) {
|
if (url.includes("robots.txt")) {
|
||||||
return Promise.resolve({ data: 'User-agent: *\nAllow: /' });
|
return Promise.resolve({ data: "User-agent: *\nAllow: /" });
|
||||||
} else if (url.includes('sitemap.xml')) {
|
} else if (url.includes("sitemap.xml")) {
|
||||||
return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs
|
return Promise.resolve({ data: "sitemap content" }); // You would normally parse this to URLs
|
||||||
}
|
}
|
||||||
return Promise.resolve({ data: '<html></html>' });
|
return Promise.resolve({ data: "<html></html>" });
|
||||||
});
|
});
|
||||||
|
|
||||||
mockRobotsParser.mockReturnValue({
|
mockRobotsParser.mockReturnValue({
|
||||||
@@ -30,12 +32,12 @@ describe('WebCrawler', () => {
|
|||||||
getMatchingLineNumber: jest.fn().mockReturnValue(0),
|
getMatchingLineNumber: jest.fn().mockReturnValue(0),
|
||||||
getCrawlDelay: jest.fn().mockReturnValue(0),
|
getCrawlDelay: jest.fn().mockReturnValue(0),
|
||||||
getSitemaps: jest.fn().mockReturnValue([]),
|
getSitemaps: jest.fn().mockReturnValue([]),
|
||||||
getPreferredHost: jest.fn().mockReturnValue('example.com')
|
getPreferredHost: jest.fn().mockReturnValue("example.com")
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should respect the limit parameter by not returning more links than specified', async () => {
|
it("should respect the limit parameter by not returning more links than specified", async () => {
|
||||||
const initialUrl = 'http://example.com';
|
const initialUrl = "http://example.com";
|
||||||
const limit = 2; // Set a limit for the number of links
|
const limit = 2; // Set a limit for the number of links
|
||||||
|
|
||||||
crawler = new WebCrawler({
|
crawler = new WebCrawler({
|
||||||
@@ -48,24 +50,27 @@ describe('WebCrawler', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Mock sitemap fetching function to return more links than the limit
|
// Mock sitemap fetching function to return more links than the limit
|
||||||
crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
|
crawler["tryFetchSitemapLinks"] = jest
|
||||||
|
.fn()
|
||||||
|
.mockResolvedValue([
|
||||||
initialUrl,
|
initialUrl,
|
||||||
initialUrl + '/page1',
|
initialUrl + "/page1",
|
||||||
initialUrl + '/page2',
|
initialUrl + "/page2",
|
||||||
initialUrl + '/page3'
|
initialUrl + "/page3"
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const filteredLinks = crawler['filterLinks'](
|
const filteredLinks = crawler["filterLinks"](
|
||||||
[initialUrl, initialUrl + '/page1', initialUrl + '/page2', initialUrl + '/page3'],
|
[
|
||||||
|
initialUrl,
|
||||||
|
initialUrl + "/page1",
|
||||||
|
initialUrl + "/page2",
|
||||||
|
initialUrl + "/page3"
|
||||||
|
],
|
||||||
limit,
|
limit,
|
||||||
10
|
10
|
||||||
);
|
);
|
||||||
|
|
||||||
expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit
|
expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit
|
||||||
expect(filteredLinks).toEqual([
|
expect(filteredLinks).toEqual([initialUrl, initialUrl + "/page1"]);
|
||||||
initialUrl,
|
|
||||||
initialUrl + '/page1'
|
|
||||||
]);
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import CacheableLookup from 'cacheable-lookup';
|
import CacheableLookup from "cacheable-lookup";
|
||||||
import https from 'node:https';
|
import https from "node:https";
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
|
|
||||||
describe("DNS", () => {
|
describe("DNS", () => {
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ export class WebCrawler {
|
|||||||
allowBackwardCrawling = false,
|
allowBackwardCrawling = false,
|
||||||
allowExternalContentLinks = false,
|
allowExternalContentLinks = false,
|
||||||
allowSubdomains = false,
|
allowSubdomains = false,
|
||||||
ignoreRobotsTxt = false,
|
ignoreRobotsTxt = false
|
||||||
}: {
|
}: {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
@@ -75,9 +75,14 @@ export class WebCrawler {
|
|||||||
this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
|
this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
|
public filterLinks(
|
||||||
|
sitemapLinks: string[],
|
||||||
|
limit: number,
|
||||||
|
maxDepth: number,
|
||||||
|
fromMap: boolean = false
|
||||||
|
): string[] {
|
||||||
// If the initial URL is a sitemap.xml, skip filtering
|
// If the initial URL is a sitemap.xml, skip filtering
|
||||||
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
|
if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
|
||||||
return sitemapLinks.slice(0, limit);
|
return sitemapLinks.slice(0, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,14 +92,17 @@ export class WebCrawler {
|
|||||||
try {
|
try {
|
||||||
url = new URL(link.trim(), this.baseUrl);
|
url = new URL(link.trim(), this.baseUrl);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Error processing link: ${link}`, { link, error, method: "filterLinks" });
|
this.logger.debug(`Error processing link: ${link}`, {
|
||||||
|
link,
|
||||||
|
error,
|
||||||
|
method: "filterLinks"
|
||||||
|
});
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const path = url.pathname;
|
const path = url.pathname;
|
||||||
|
|
||||||
const depth = getURLDepth(url.toString());
|
const depth = getURLDepth(url.toString());
|
||||||
|
|
||||||
|
|
||||||
// Check if the link exceeds the maximum depth allowed
|
// Check if the link exceeds the maximum depth allowed
|
||||||
if (depth > maxDepth) {
|
if (depth > maxDepth) {
|
||||||
return false;
|
return false;
|
||||||
@@ -113,9 +121,11 @@ export class WebCrawler {
|
|||||||
|
|
||||||
// Check if the link matches the include patterns, if any are specified
|
// Check if the link matches the include patterns, if any are specified
|
||||||
if (this.includes.length > 0 && this.includes[0] !== "") {
|
if (this.includes.length > 0 && this.includes[0] !== "") {
|
||||||
if (!this.includes.some((includePattern) =>
|
if (
|
||||||
|
!this.includes.some((includePattern) =>
|
||||||
new RegExp(includePattern).test(path)
|
new RegExp(includePattern).test(path)
|
||||||
)) {
|
)
|
||||||
|
) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -128,8 +138,11 @@ export class WebCrawler {
|
|||||||
} catch (_) {
|
} catch (_) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
|
const initialHostname = normalizedInitialUrl.hostname.replace(
|
||||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
|
/^www\./,
|
||||||
|
""
|
||||||
|
);
|
||||||
|
const linkHostname = normalizedLink.hostname.replace(/^www\./, "");
|
||||||
|
|
||||||
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
||||||
// commented to able to handling external link on allowExternalContentLinks
|
// commented to able to handling external link on allowExternalContentLinks
|
||||||
@@ -138,15 +151,22 @@ export class WebCrawler {
|
|||||||
// }
|
// }
|
||||||
|
|
||||||
if (!this.allowBackwardCrawling) {
|
if (!this.allowBackwardCrawling) {
|
||||||
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
|
if (
|
||||||
|
!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
|
||||||
|
) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
|
const isAllowed = this.ignoreRobotsTxt
|
||||||
|
? true
|
||||||
|
: (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
|
||||||
// Check if the link is disallowed by robots.txt
|
// Check if the link is disallowed by robots.txt
|
||||||
if (!isAllowed) {
|
if (!isAllowed) {
|
||||||
this.logger.debug(`Link disallowed by robots.txt: ${link}`, { method: "filterLinks", link });
|
this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
|
||||||
|
method: "filterLinks",
|
||||||
|
link
|
||||||
|
});
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -161,12 +181,15 @@ export class WebCrawler {
|
|||||||
|
|
||||||
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
|
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
|
||||||
let extraArgs = {};
|
let extraArgs = {};
|
||||||
if(skipTlsVerification) {
|
if (skipTlsVerification) {
|
||||||
extraArgs["httpsAgent"] = new https.Agent({
|
extraArgs["httpsAgent"] = new https.Agent({
|
||||||
rejectUnauthorized: false
|
rejectUnauthorized: false
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
|
const response = await axios.get(this.robotsTxtUrl, {
|
||||||
|
timeout: axiosTimeout,
|
||||||
|
...extraArgs
|
||||||
|
});
|
||||||
return response.data;
|
return response.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,15 +197,25 @@ export class WebCrawler {
|
|||||||
this.robots = robotsParser(this.robotsTxtUrl, txt);
|
this.robots = robotsParser(this.robotsTxtUrl, txt);
|
||||||
}
|
}
|
||||||
|
|
||||||
public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
|
public async tryGetSitemap(
|
||||||
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { method: "tryGetSitemap" });
|
fromMap: boolean = false,
|
||||||
|
onlySitemap: boolean = false
|
||||||
|
): Promise<{ url: string; html: string }[] | null> {
|
||||||
|
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
|
||||||
|
method: "tryGetSitemap"
|
||||||
|
});
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
if(fromMap && onlySitemap) {
|
if (fromMap && onlySitemap) {
|
||||||
return sitemapLinks.map(link => ({ url: link, html: "" }));
|
return sitemapLinks.map((link) => ({ url: link, html: "" }));
|
||||||
}
|
}
|
||||||
if (sitemapLinks.length > 0) {
|
if (sitemapLinks.length > 0) {
|
||||||
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
|
let filteredLinks = this.filterLinks(
|
||||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
sitemapLinks,
|
||||||
|
this.limit,
|
||||||
|
this.maxCrawledDepth,
|
||||||
|
fromMap
|
||||||
|
);
|
||||||
|
return filteredLinks.map((link) => ({ url: link, html: "" }));
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -204,15 +237,18 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
const path = urlObj.pathname;
|
const path = urlObj.pathname;
|
||||||
|
|
||||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
if (this.isInternalLink(fullUrl)) {
|
||||||
if (this.isInternalLink(fullUrl) &&
|
// INTERNAL LINKS
|
||||||
|
if (
|
||||||
|
this.isInternalLink(fullUrl) &&
|
||||||
this.noSections(fullUrl) &&
|
this.noSections(fullUrl) &&
|
||||||
!this.matchesExcludes(path) &&
|
!this.matchesExcludes(path) &&
|
||||||
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
|
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
|
||||||
) {
|
) {
|
||||||
return fullUrl;
|
return fullUrl;
|
||||||
}
|
}
|
||||||
} else { // EXTERNAL LINKS
|
} else {
|
||||||
|
// EXTERNAL LINKS
|
||||||
if (
|
if (
|
||||||
this.isInternalLink(url) &&
|
this.isInternalLink(url) &&
|
||||||
this.allowExternalContentLinks &&
|
this.allowExternalContentLinks &&
|
||||||
@@ -224,7 +260,11 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
|
if (
|
||||||
|
this.allowSubdomains &&
|
||||||
|
!this.isSocialMediaOrEmail(fullUrl) &&
|
||||||
|
this.isSubdomain(fullUrl)
|
||||||
|
) {
|
||||||
return fullUrl;
|
return fullUrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,14 +301,20 @@ export class WebCrawler {
|
|||||||
return links;
|
return links;
|
||||||
}
|
}
|
||||||
|
|
||||||
private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean {
|
private isRobotsAllowed(
|
||||||
return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true))
|
url: string,
|
||||||
|
ignoreRobotsTxt: boolean = false
|
||||||
|
): boolean {
|
||||||
|
return ignoreRobotsTxt
|
||||||
|
? true
|
||||||
|
: this.robots
|
||||||
|
? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true)
|
||||||
|
: true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
|
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
|
||||||
return this.excludes.some((pattern) => {
|
return this.excludes.some((pattern) => {
|
||||||
if (onlyDomains)
|
if (onlyDomains) return this.matchesExcludesExternalDomains(url);
|
||||||
return this.matchesExcludesExternalDomains(url);
|
|
||||||
|
|
||||||
return this.excludes.some((pattern) => new RegExp(pattern).test(url));
|
return this.excludes.some((pattern) => new RegExp(pattern).test(url));
|
||||||
});
|
});
|
||||||
@@ -282,11 +328,14 @@ export class WebCrawler {
|
|||||||
const pathname = urlObj.pathname;
|
const pathname = urlObj.pathname;
|
||||||
|
|
||||||
for (let domain of this.excludes) {
|
for (let domain of this.excludes) {
|
||||||
let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
|
let domainObj = new URL("http://" + domain.replace(/^https?:\/\//, ""));
|
||||||
let domainHostname = domainObj.hostname;
|
let domainHostname = domainObj.hostname;
|
||||||
let domainPathname = domainObj.pathname;
|
let domainPathname = domainObj.pathname;
|
||||||
|
|
||||||
if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
|
if (
|
||||||
|
hostname === domainHostname ||
|
||||||
|
hostname.endsWith(`.${domainHostname}`)
|
||||||
|
) {
|
||||||
if (pathname.startsWith(domainPathname)) {
|
if (pathname.startsWith(domainPathname)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -298,8 +347,13 @@ export class WebCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private isExternalMainPage(url:string):boolean {
|
private isExternalMainPage(url: string): boolean {
|
||||||
return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length)
|
return !Boolean(
|
||||||
|
url
|
||||||
|
.split("/")
|
||||||
|
.slice(3)
|
||||||
|
.filter((subArray) => subArray.length > 0).length
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
private noSections(link: string): boolean {
|
private noSections(link: string): boolean {
|
||||||
@@ -308,14 +362,19 @@ export class WebCrawler {
|
|||||||
|
|
||||||
private isInternalLink(link: string): boolean {
|
private isInternalLink(link: string): boolean {
|
||||||
const urlObj = new URL(link, this.baseUrl);
|
const urlObj = new URL(link, this.baseUrl);
|
||||||
const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
|
const baseDomain = this.baseUrl
|
||||||
|
.replace(/^https?:\/\//, "")
|
||||||
|
.replace(/^www\./, "")
|
||||||
|
.trim();
|
||||||
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
|
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
|
||||||
|
|
||||||
return linkDomain === baseDomain;
|
return linkDomain === baseDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
private isSubdomain(link: string): boolean {
|
private isSubdomain(link: string): boolean {
|
||||||
return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."));
|
return new URL(link, this.baseUrl).hostname.endsWith(
|
||||||
|
"." + new URL(this.baseUrl).hostname.split(".").slice(-2).join(".")
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public isFile(url: string): boolean {
|
public isFile(url: string): boolean {
|
||||||
@@ -350,10 +409,13 @@ export class WebCrawler {
|
|||||||
];
|
];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const urlWithoutQuery = url.split('?')[0].toLowerCase();
|
const urlWithoutQuery = url.split("?")[0].toLowerCase();
|
||||||
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error(`Error processing URL in isFile`, { method: "isFile", error });
|
this.logger.error(`Error processing URL in isFile`, {
|
||||||
|
method: "isFile",
|
||||||
|
error
|
||||||
|
});
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -369,7 +431,7 @@ export class WebCrawler {
|
|||||||
"github.com",
|
"github.com",
|
||||||
"calendly.com",
|
"calendly.com",
|
||||||
"discord.gg",
|
"discord.gg",
|
||||||
"discord.com",
|
"discord.com"
|
||||||
];
|
];
|
||||||
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
return socialMediaOrEmail.some((ext) => url.includes(ext));
|
||||||
}
|
}
|
||||||
@@ -383,10 +445,7 @@ export class WebCrawler {
|
|||||||
return url;
|
return url;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
|
||||||
const sitemapUrl = url.endsWith(".xml")
|
|
||||||
? url
|
|
||||||
: `${url}/sitemap.xml`;
|
|
||||||
|
|
||||||
let sitemapLinks: string[] = [];
|
let sitemapLinks: string[] = [];
|
||||||
|
|
||||||
@@ -396,11 +455,17 @@ export class WebCrawler {
|
|||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
|
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}`, { method: "tryFetchSitemapLinks", sitemapUrl, error });
|
this.logger.debug(
|
||||||
|
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
|
||||||
|
{ method: "tryFetchSitemapLinks", sitemapUrl, error }
|
||||||
|
);
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||||
// ignore 404
|
// ignore 404
|
||||||
} else {
|
} else {
|
||||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }, this.logger);
|
const response = await getLinksFromSitemap(
|
||||||
|
{ sitemapUrl, mode: "fire-engine" },
|
||||||
|
this.logger
|
||||||
|
);
|
||||||
if (response) {
|
if (response) {
|
||||||
sitemapLinks = response;
|
sitemapLinks = response;
|
||||||
}
|
}
|
||||||
@@ -410,24 +475,41 @@ export class WebCrawler {
|
|||||||
if (sitemapLinks.length === 0) {
|
if (sitemapLinks.length === 0) {
|
||||||
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
|
const response = await axios.get(baseUrlSitemap, {
|
||||||
|
timeout: axiosTimeout
|
||||||
|
});
|
||||||
if (response.status === 200) {
|
if (response.status === 200) {
|
||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }, this.logger);
|
sitemapLinks = await getLinksFromSitemap(
|
||||||
|
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
||||||
|
this.logger
|
||||||
|
);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { method: "tryFetchSitemapLinks", sitemapUrl: baseUrlSitemap, error });
|
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
||||||
|
method: "tryFetchSitemapLinks",
|
||||||
|
sitemapUrl: baseUrlSitemap,
|
||||||
|
error
|
||||||
|
});
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
if (error instanceof AxiosError && error.response?.status === 404) {
|
||||||
// ignore 404
|
// ignore 404
|
||||||
} else {
|
} else {
|
||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }, this.logger);
|
sitemapLinks = await getLinksFromSitemap(
|
||||||
|
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
||||||
|
this.logger
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const normalizedUrl = normalizeUrl(url);
|
const normalizedUrl = normalizeUrl(url);
|
||||||
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
|
const normalizedSitemapLinks = sitemapLinks.map((link) =>
|
||||||
|
normalizeUrl(link)
|
||||||
|
);
|
||||||
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
|
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
|
||||||
if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
|
if (
|
||||||
|
!normalizedSitemapLinks.includes(normalizedUrl) &&
|
||||||
|
sitemapLinks.length > 0
|
||||||
|
) {
|
||||||
sitemapLinks.push(url);
|
sitemapLinks.push(url);
|
||||||
}
|
}
|
||||||
return sitemapLinks;
|
return sitemapLinks;
|
||||||
|
|||||||
@@ -3,9 +3,17 @@ import { logger } from "../../../lib/logger";
|
|||||||
export async function handleCustomScraping(
|
export async function handleCustomScraping(
|
||||||
text: string,
|
text: string,
|
||||||
url: string
|
url: string
|
||||||
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
|
): Promise<{
|
||||||
|
scraper: string;
|
||||||
|
url: string;
|
||||||
|
waitAfterLoad?: number;
|
||||||
|
pageOptions?: { scrollXPaths?: string[] };
|
||||||
|
} | null> {
|
||||||
// Check for Readme Docs special case
|
// Check for Readme Docs special case
|
||||||
if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
|
if (
|
||||||
|
text.includes('<meta name="readme-deploy"') &&
|
||||||
|
!url.includes("developers.notion.com")
|
||||||
|
) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
|
||||||
);
|
);
|
||||||
@@ -14,7 +22,9 @@ export async function handleCustomScraping(
|
|||||||
url: url,
|
url: url,
|
||||||
waitAfterLoad: 1000,
|
waitAfterLoad: 1000,
|
||||||
pageOptions: {
|
pageOptions: {
|
||||||
scrollXPaths: ['//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]']
|
scrollXPaths: [
|
||||||
|
'//*[@id="ReferencePlayground"]/section[3]/div/pre/div/div/div[5]'
|
||||||
|
]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -27,18 +37,21 @@ export async function handleCustomScraping(
|
|||||||
return {
|
return {
|
||||||
scraper: "fire-engine",
|
scraper: "fire-engine",
|
||||||
url: url,
|
url: url,
|
||||||
waitAfterLoad: 3000,
|
waitAfterLoad: 3000
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for Google Drive PDF links in meta tags
|
// Check for Google Drive PDF links in meta tags
|
||||||
const googleDriveMetaPattern = /<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
|
const googleDriveMetaPattern =
|
||||||
|
/<meta itemprop="url" content="(https:\/\/drive\.google\.com\/file\/d\/[^"]+)"/;
|
||||||
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
|
||||||
if (googleDriveMetaMatch) {
|
if (googleDriveMetaMatch) {
|
||||||
const url = googleDriveMetaMatch[1];
|
const url = googleDriveMetaMatch[1];
|
||||||
logger.debug(`Google Drive PDF link detected: ${url}`);
|
logger.debug(`Google Drive PDF link detected: ${url}`);
|
||||||
|
|
||||||
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
|
const fileIdMatch = url.match(
|
||||||
|
/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/
|
||||||
|
);
|
||||||
if (fileIdMatch) {
|
if (fileIdMatch) {
|
||||||
const fileId = fileIdMatch[1];
|
const fileId = fileIdMatch[1];
|
||||||
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
|
const pdfUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;
|
||||||
|
|||||||
@@ -10,29 +10,39 @@ export async function getLinksFromSitemap(
|
|||||||
{
|
{
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
allUrls = [],
|
allUrls = [],
|
||||||
mode = 'axios'
|
mode = "axios"
|
||||||
}: {
|
}: {
|
||||||
sitemapUrl: string,
|
sitemapUrl: string;
|
||||||
allUrls?: string[],
|
allUrls?: string[];
|
||||||
mode?: 'axios' | 'fire-engine'
|
mode?: "axios" | "fire-engine";
|
||||||
},
|
},
|
||||||
logger: Logger,
|
logger: Logger
|
||||||
): Promise<string[]> {
|
): Promise<string[]> {
|
||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
|
||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
content = response.data;
|
content = response.data;
|
||||||
} else if (mode === 'fire-engine') {
|
} else if (mode === "fire-engine") {
|
||||||
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
|
const response = await scrapeURL(
|
||||||
|
"sitemap",
|
||||||
|
sitemapUrl,
|
||||||
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
|
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }
|
||||||
|
);
|
||||||
if (!response.success) {
|
if (!response.success) {
|
||||||
throw response.error;
|
throw response.error;
|
||||||
}
|
}
|
||||||
content = response.document.rawHtml!;
|
content = response.document.rawHtml!;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Request failed for ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error });
|
logger.error(`Request failed for ${sitemapUrl}`, {
|
||||||
|
method: "getLinksFromSitemap",
|
||||||
|
mode,
|
||||||
|
sitemapUrl,
|
||||||
|
error
|
||||||
|
});
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
}
|
}
|
||||||
@@ -42,26 +52,46 @@ export async function getLinksFromSitemap(
|
|||||||
|
|
||||||
if (root && root.sitemap) {
|
if (root && root.sitemap) {
|
||||||
const sitemapPromises = root.sitemap
|
const sitemapPromises = root.sitemap
|
||||||
.filter(sitemap => sitemap.loc && sitemap.loc.length > 0)
|
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
||||||
.map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }, logger));
|
.map((sitemap) =>
|
||||||
|
getLinksFromSitemap(
|
||||||
|
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
|
||||||
|
logger
|
||||||
|
)
|
||||||
|
);
|
||||||
await Promise.all(sitemapPromises);
|
await Promise.all(sitemapPromises);
|
||||||
} else if (root && root.url) {
|
} else if (root && root.url) {
|
||||||
const validUrls = root.url
|
const validUrls = root.url
|
||||||
.filter(url => url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0]))
|
.filter(
|
||||||
.map(url => url.loc[0]);
|
(url) =>
|
||||||
|
url.loc &&
|
||||||
|
url.loc.length > 0 &&
|
||||||
|
!WebCrawler.prototype.isFile(url.loc[0])
|
||||||
|
)
|
||||||
|
.map((url) => url.loc[0]);
|
||||||
allUrls.push(...validUrls);
|
allUrls.push(...validUrls);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error });
|
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
|
||||||
|
method: "getLinksFromSitemap",
|
||||||
|
mode,
|
||||||
|
sitemapUrl,
|
||||||
|
error
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
|
export const fetchSitemapData = async (
|
||||||
|
url: string,
|
||||||
|
timeout?: number
|
||||||
|
): Promise<SitemapEntry[] | null> => {
|
||||||
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
|
const response = await axios.get(sitemapUrl, {
|
||||||
|
timeout: timeout || axiosTimeout
|
||||||
|
});
|
||||||
if (response.status === 200) {
|
if (response.status === 200) {
|
||||||
const xml = response.data;
|
const xml = response.data;
|
||||||
const parsedXml = await parseStringPromise(xml);
|
const parsedXml = await parseStringPromise(xml);
|
||||||
@@ -71,8 +101,10 @@ export const fetchSitemapData = async (url: string, timeout?: number): Promise<S
|
|||||||
for (const urlElement of parsedXml.urlset.url) {
|
for (const urlElement of parsedXml.urlset.url) {
|
||||||
const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
|
const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
|
||||||
if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
|
if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
|
||||||
if (urlElement.changefreq) sitemapEntry.changefreq = urlElement.changefreq[0];
|
if (urlElement.changefreq)
|
||||||
if (urlElement.priority) sitemapEntry.priority = Number(urlElement.priority[0]);
|
sitemapEntry.changefreq = urlElement.changefreq[0];
|
||||||
|
if (urlElement.priority)
|
||||||
|
sitemapEntry.priority = Number(urlElement.priority[0]);
|
||||||
sitemapData.push(sitemapEntry);
|
sitemapData.push(sitemapEntry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -84,7 +116,7 @@ export const fetchSitemapData = async (url: string, timeout?: number): Promise<S
|
|||||||
// Error handling for failed sitemap fetch
|
// Error handling for failed sitemap fetch
|
||||||
}
|
}
|
||||||
return [];
|
return [];
|
||||||
}
|
};
|
||||||
|
|
||||||
export interface SitemapEntry {
|
export interface SitemapEntry {
|
||||||
loc: string;
|
loc: string;
|
||||||
|
|||||||
@@ -1,88 +1,94 @@
|
|||||||
import { isUrlBlocked } from '../blocklist';
|
import { isUrlBlocked } from "../blocklist";
|
||||||
|
|
||||||
describe('Blocklist Functionality', () => {
|
describe("Blocklist Functionality", () => {
|
||||||
describe('isUrlBlocked', () => {
|
describe("isUrlBlocked", () => {
|
||||||
test.each([
|
test.each([
|
||||||
'https://facebook.com/fake-test',
|
"https://facebook.com/fake-test",
|
||||||
'https://x.com/user-profile',
|
"https://x.com/user-profile",
|
||||||
'https://twitter.com/home',
|
"https://twitter.com/home",
|
||||||
'https://instagram.com/explore',
|
"https://instagram.com/explore",
|
||||||
'https://linkedin.com/in/johndoe',
|
"https://linkedin.com/in/johndoe",
|
||||||
'https://snapchat.com/add/johndoe',
|
"https://snapchat.com/add/johndoe",
|
||||||
'https://tiktok.com/@johndoe',
|
"https://tiktok.com/@johndoe",
|
||||||
'https://reddit.com/r/funny',
|
"https://reddit.com/r/funny",
|
||||||
'https://tumblr.com/dashboard',
|
"https://tumblr.com/dashboard",
|
||||||
'https://flickr.com/photos/johndoe',
|
"https://flickr.com/photos/johndoe",
|
||||||
'https://whatsapp.com/download',
|
"https://whatsapp.com/download",
|
||||||
'https://wechat.com/features',
|
"https://wechat.com/features",
|
||||||
'https://telegram.org/apps'
|
"https://telegram.org/apps"
|
||||||
])('should return true for blocklisted URL %s', (url) => {
|
])("should return true for blocklisted URL %s", (url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
'https://facebook.com/policy',
|
"https://facebook.com/policy",
|
||||||
'https://twitter.com/tos',
|
"https://twitter.com/tos",
|
||||||
'https://instagram.com/about/legal/terms',
|
"https://instagram.com/about/legal/terms",
|
||||||
'https://linkedin.com/legal/privacy-policy',
|
"https://linkedin.com/legal/privacy-policy",
|
||||||
'https://pinterest.com/about/privacy',
|
"https://pinterest.com/about/privacy",
|
||||||
'https://snapchat.com/legal/terms',
|
"https://snapchat.com/legal/terms",
|
||||||
'https://tiktok.com/legal/privacy-policy',
|
"https://tiktok.com/legal/privacy-policy",
|
||||||
'https://reddit.com/policies',
|
"https://reddit.com/policies",
|
||||||
'https://tumblr.com/policy/en/privacy',
|
"https://tumblr.com/policy/en/privacy",
|
||||||
'https://flickr.com/help/terms',
|
"https://flickr.com/help/terms",
|
||||||
'https://whatsapp.com/legal',
|
"https://whatsapp.com/legal",
|
||||||
'https://wechat.com/en/privacy-policy',
|
"https://wechat.com/en/privacy-policy",
|
||||||
'https://telegram.org/tos'
|
"https://telegram.org/tos"
|
||||||
])('should return false for allowed URLs with keywords %s', (url) => {
|
])("should return false for allowed URLs with keywords %s", (url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('should return false for non-blocklisted domain', () => {
|
test("should return false for non-blocklisted domain", () => {
|
||||||
const url = 'https://example.com';
|
const url = "https://example.com";
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('should handle invalid URLs gracefully', () => {
|
test("should handle invalid URLs gracefully", () => {
|
||||||
const url = 'htp://invalid-url';
|
const url = "htp://invalid-url";
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
'https://subdomain.facebook.com',
|
"https://subdomain.facebook.com",
|
||||||
'https://facebook.com.someotherdomain.com',
|
"https://facebook.com.someotherdomain.com",
|
||||||
'https://www.facebook.com/profile',
|
"https://www.facebook.com/profile",
|
||||||
'https://api.twitter.com/info',
|
"https://api.twitter.com/info",
|
||||||
'https://instagram.com/accounts/login'
|
"https://instagram.com/accounts/login"
|
||||||
])('should return true for URLs with blocklisted domains in subdomains or paths %s', (url) => {
|
])(
|
||||||
|
"should return true for URLs with blocklisted domains in subdomains or paths %s",
|
||||||
|
(url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
'https://example.com/facebook.com',
|
"https://example.com/facebook.com",
|
||||||
'https://example.com/redirect?url=https://twitter.com',
|
"https://example.com/redirect?url=https://twitter.com",
|
||||||
'https://facebook.com.policy.example.com'
|
"https://facebook.com.policy.example.com"
|
||||||
])('should return false for URLs where blocklisted domain is part of another domain or path %s', (url) => {
|
])(
|
||||||
|
"should return false for URLs where blocklisted domain is part of another domain or path %s",
|
||||||
|
(url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
|
||||||
|
"should handle case variations %s",
|
||||||
|
(url) => {
|
||||||
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
'https://FACEBOOK.com',
|
"https://facebook.com?redirect=https://example.com",
|
||||||
'https://INSTAGRAM.com/@something'
|
"https://twitter.com?query=something"
|
||||||
])('should handle case variations %s', (url) => {
|
])("should handle query parameters %s", (url) => {
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
expect(isUrlBlocked(url)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
test.each([
|
test("should handle internationalized domain names", () => {
|
||||||
'https://facebook.com?redirect=https://example.com',
|
const url = "https://xn--d1acpjx3f.xn--p1ai";
|
||||||
'https://twitter.com?query=something'
|
|
||||||
])('should handle query parameters %s', (url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
test('should handle internationalized domain names', () => {
|
|
||||||
const url = 'https://xn--d1acpjx3f.xn--p1ai';
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked(url)).toBe(false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -1,47 +1,42 @@
|
|||||||
import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils';
|
import { getURLDepth, getAdjustedMaxDepth } from "../maxDepthUtils";
|
||||||
|
|
||||||
describe('Testing getURLDepth and getAdjustedMaxDepth', () => {
|
describe("Testing getURLDepth and getAdjustedMaxDepth", () => {
|
||||||
it('should return 0 for root - mendable.ai', () => {
|
it("should return 0 for root - mendable.ai", () => {
|
||||||
const enteredURL = "https://www.mendable.ai/"
|
const enteredURL = "https://www.mendable.ai/";
|
||||||
expect(getURLDepth(enteredURL)).toBe(0);
|
expect(getURLDepth(enteredURL)).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should return 0 for root - scrapethissite.com', () => {
|
it("should return 0 for root - scrapethissite.com", () => {
|
||||||
const enteredURL = "https://scrapethissite.com/"
|
const enteredURL = "https://scrapethissite.com/";
|
||||||
expect(getURLDepth(enteredURL)).toBe(0);
|
expect(getURLDepth(enteredURL)).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should return 1 for scrapethissite.com/pages', () => {
|
it("should return 1 for scrapethissite.com/pages", () => {
|
||||||
const enteredURL = "https://scrapethissite.com/pages"
|
const enteredURL = "https://scrapethissite.com/pages";
|
||||||
expect(getURLDepth(enteredURL)).toBe(1);
|
expect(getURLDepth(enteredURL)).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should return 2 for scrapethissite.com/pages/articles', () => {
|
it("should return 2 for scrapethissite.com/pages/articles", () => {
|
||||||
const enteredURL = "https://scrapethissite.com/pages/articles"
|
const enteredURL = "https://scrapethissite.com/pages/articles";
|
||||||
expect(getURLDepth(enteredURL)).toBe(2);
|
expect(getURLDepth(enteredURL)).toBe(2);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => {
|
it("Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1", () => {
|
||||||
const enteredURL = "https://scrapethissite.com"
|
const enteredURL = "https://scrapethissite.com";
|
||||||
expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1);
|
expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1);
|
||||||
|
|
||||||
});
|
});
|
||||||
it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => {
|
it("Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0", () => {
|
||||||
const enteredURL = "https://scrapethissite.com"
|
const enteredURL = "https://scrapethissite.com";
|
||||||
expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
|
|
||||||
|
|
||||||
});
|
|
||||||
|
|
||||||
it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => {
|
|
||||||
const enteredURL = "https://mendable.ai"
|
|
||||||
expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
|
expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => {
|
it("Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0", () => {
|
||||||
const enteredURL = "https://scrapethissite.com/pages/articles"
|
const enteredURL = "https://mendable.ai";
|
||||||
|
expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2", () => {
|
||||||
|
const enteredURL = "https://scrapethissite.com/pages/articles";
|
||||||
expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4);
|
expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,68 +1,75 @@
|
|||||||
import { logger } from "../../../lib/logger";
|
import { logger } from "../../../lib/logger";
|
||||||
|
|
||||||
const socialMediaBlocklist = [
|
const socialMediaBlocklist = [
|
||||||
'facebook.com',
|
"facebook.com",
|
||||||
'x.com',
|
"x.com",
|
||||||
'twitter.com',
|
"twitter.com",
|
||||||
'instagram.com',
|
"instagram.com",
|
||||||
'linkedin.com',
|
"linkedin.com",
|
||||||
'snapchat.com',
|
"snapchat.com",
|
||||||
'tiktok.com',
|
"tiktok.com",
|
||||||
'reddit.com',
|
"reddit.com",
|
||||||
'tumblr.com',
|
"tumblr.com",
|
||||||
'flickr.com',
|
"flickr.com",
|
||||||
'whatsapp.com',
|
"whatsapp.com",
|
||||||
'wechat.com',
|
"wechat.com",
|
||||||
'telegram.org',
|
"telegram.org",
|
||||||
'researchhub.com',
|
"researchhub.com",
|
||||||
'youtube.com',
|
"youtube.com",
|
||||||
'corterix.com',
|
"corterix.com",
|
||||||
'southwest.com',
|
"southwest.com",
|
||||||
'ryanair.com'
|
"ryanair.com"
|
||||||
];
|
];
|
||||||
|
|
||||||
const allowedKeywords = [
|
const allowedKeywords = [
|
||||||
'pulse',
|
"pulse",
|
||||||
'privacy',
|
"privacy",
|
||||||
'terms',
|
"terms",
|
||||||
'policy',
|
"policy",
|
||||||
'user-agreement',
|
"user-agreement",
|
||||||
'legal',
|
"legal",
|
||||||
'help',
|
"help",
|
||||||
'policies',
|
"policies",
|
||||||
'support',
|
"support",
|
||||||
'contact',
|
"contact",
|
||||||
'about',
|
"about",
|
||||||
'careers',
|
"careers",
|
||||||
'blog',
|
"blog",
|
||||||
'press',
|
"press",
|
||||||
'conditions',
|
"conditions",
|
||||||
'tos',
|
"tos",
|
||||||
'://library.tiktok.com',
|
"://library.tiktok.com",
|
||||||
'://ads.tiktok.com',
|
"://ads.tiktok.com",
|
||||||
'://tiktok.com/business',
|
"://tiktok.com/business",
|
||||||
'://developers.facebook.com'
|
"://developers.facebook.com"
|
||||||
];
|
];
|
||||||
|
|
||||||
export function isUrlBlocked(url: string): boolean {
|
export function isUrlBlocked(url: string): boolean {
|
||||||
const lowerCaseUrl = url.toLowerCase();
|
const lowerCaseUrl = url.toLowerCase();
|
||||||
|
|
||||||
// Check if the URL contains any allowed keywords as whole words
|
// Check if the URL contains any allowed keywords as whole words
|
||||||
if (allowedKeywords.some(keyword => new RegExp(`\\b${keyword}\\b`, 'i').test(lowerCaseUrl))) {
|
if (
|
||||||
|
allowedKeywords.some((keyword) =>
|
||||||
|
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl)
|
||||||
|
)
|
||||||
|
) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (!url.startsWith('http://') && !url.startsWith('https://')) {
|
if (!url.startsWith("http://") && !url.startsWith("https://")) {
|
||||||
url = 'https://' + url;
|
url = "https://" + url;
|
||||||
}
|
}
|
||||||
|
|
||||||
const urlObj = new URL(url);
|
const urlObj = new URL(url);
|
||||||
const hostname = urlObj.hostname.toLowerCase();
|
const hostname = urlObj.hostname.toLowerCase();
|
||||||
|
|
||||||
// Check if the URL matches any domain in the blocklist
|
// Check if the URL matches any domain in the blocklist
|
||||||
const isBlocked = socialMediaBlocklist.some(domain => {
|
const isBlocked = socialMediaBlocklist.some((domain) => {
|
||||||
const domainPattern = new RegExp(`(^|\\.)${domain.replace('.', '\\.')}(\\.|$)`, 'i');
|
const domainPattern = new RegExp(
|
||||||
|
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
|
||||||
|
"i"
|
||||||
|
);
|
||||||
return domainPattern.test(hostname);
|
return domainPattern.test(hostname);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,12 +1,15 @@
|
|||||||
|
export function getAdjustedMaxDepth(
|
||||||
|
url: string,
|
||||||
export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number {
|
maxCrawlDepth: number
|
||||||
|
): number {
|
||||||
const baseURLDepth = getURLDepth(url);
|
const baseURLDepth = getURLDepth(url);
|
||||||
const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;
|
const adjustedMaxDepth = maxCrawlDepth + baseURLDepth;
|
||||||
return adjustedMaxDepth;
|
return adjustedMaxDepth;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function getURLDepth(url: string): number {
|
export function getURLDepth(url: string): number {
|
||||||
const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html");
|
const pathSplits = new URL(url).pathname
|
||||||
|
.split("/")
|
||||||
|
.filter((x) => x !== "" && x !== "index.php" && x !== "index.html");
|
||||||
return pathSplits.length;
|
return pathSplits.length;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
export const removeBase64Images = async (
|
export const removeBase64Images = async (markdown: string) => {
|
||||||
markdown: string,
|
|
||||||
) => {
|
|
||||||
const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g;
|
const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g;
|
||||||
markdown = markdown.replace(regex, '$1(<Base64-Image-Removed>)');
|
markdown = markdown.replace(regex, "$1(<Base64-Image-Removed>)");
|
||||||
return markdown;
|
return markdown;
|
||||||
};
|
};
|
||||||
|
|||||||
+1
-1
@@ -14,6 +14,6 @@ export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
url: entry.url,
|
url: entry.url,
|
||||||
html: entry.html,
|
html: entry.html,
|
||||||
statusCode: entry.statusCode,
|
statusCode: entry.statusCode,
|
||||||
error: entry.error,
|
error: entry.error
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -10,6 +10,6 @@ export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
url: response.url,
|
url: response.url,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
|
||||||
html: (await mammoth.convertToHtml({ path: tempFilePath })).value,
|
html: (await mammoth.convertToHtml({ path: tempFilePath })).value
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,26 +3,34 @@ import { Meta } from "../..";
|
|||||||
import { TimeoutError } from "../../error";
|
import { TimeoutError } from "../../error";
|
||||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
|
|
||||||
export async function scrapeURLWithFetch(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeURLWithFetch(
|
||||||
|
meta: Meta
|
||||||
|
): Promise<EngineScrapeResult> {
|
||||||
const timeout = 20000;
|
const timeout = 20000;
|
||||||
|
|
||||||
const response = await Promise.race([
|
const response = await Promise.race([
|
||||||
fetch(meta.url, {
|
fetch(meta.url, {
|
||||||
redirect: "follow",
|
redirect: "follow",
|
||||||
headers: meta.options.headers,
|
headers: meta.options.headers
|
||||||
}),
|
}),
|
||||||
(async () => {
|
(async () => {
|
||||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||||
throw new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } });
|
throw new TimeoutError(
|
||||||
|
"Fetch was unable to scrape the page before timing out",
|
||||||
|
{ cause: { timeout } }
|
||||||
|
);
|
||||||
})()
|
})()
|
||||||
]);
|
]);
|
||||||
|
|
||||||
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any));
|
specialtyScrapeCheck(
|
||||||
|
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
||||||
|
Object.fromEntries(response.headers as any)
|
||||||
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: response.url,
|
url: response.url,
|
||||||
html: await response.text(),
|
html: await response.text(),
|
||||||
statusCode: response.status,
|
statusCode: response.status
|
||||||
// TODO: error?
|
// TODO: error?
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,56 +28,72 @@ const successSchema = z.object({
|
|||||||
|
|
||||||
// new: actions
|
// new: actions
|
||||||
screenshots: z.string().array().optional(),
|
screenshots: z.string().array().optional(),
|
||||||
actionContent: z.object({
|
actionContent: z
|
||||||
|
.object({
|
||||||
url: z.string(),
|
url: z.string(),
|
||||||
html: z.string(),
|
html: z.string()
|
||||||
}).array().optional(),
|
})
|
||||||
})
|
.array()
|
||||||
|
.optional()
|
||||||
|
});
|
||||||
|
|
||||||
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
||||||
|
|
||||||
const processingSchema = z.object({
|
const processingSchema = z.object({
|
||||||
jobId: z.string(),
|
jobId: z.string(),
|
||||||
state: z.enum(["delayed", "active", "waiting", "waiting-children", "unknown", "prioritized"]),
|
state: z.enum([
|
||||||
processing: z.boolean(),
|
"delayed",
|
||||||
|
"active",
|
||||||
|
"waiting",
|
||||||
|
"waiting-children",
|
||||||
|
"unknown",
|
||||||
|
"prioritized"
|
||||||
|
]),
|
||||||
|
processing: z.boolean()
|
||||||
});
|
});
|
||||||
|
|
||||||
const failedSchema = z.object({
|
const failedSchema = z.object({
|
||||||
jobId: z.string(),
|
jobId: z.string(),
|
||||||
state: z.literal("failed"),
|
state: z.literal("failed"),
|
||||||
processing: z.literal(false),
|
processing: z.literal(false),
|
||||||
error: z.string(),
|
error: z.string()
|
||||||
});
|
});
|
||||||
|
|
||||||
export class StillProcessingError extends Error {
|
export class StillProcessingError extends Error {
|
||||||
constructor(jobId: string) {
|
constructor(jobId: string) {
|
||||||
super("Job is still under processing", { cause: { jobId } })
|
super("Job is still under processing", { cause: { jobId } });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise<FireEngineCheckStatusSuccess> {
|
export async function fireEngineCheckStatus(
|
||||||
|
logger: Logger,
|
||||||
|
jobId: string
|
||||||
|
): Promise<FireEngineCheckStatusSuccess> {
|
||||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||||
|
|
||||||
const status = await Sentry.startSpan({
|
const status = await Sentry.startSpan(
|
||||||
|
{
|
||||||
name: "fire-engine: Check status",
|
name: "fire-engine: Check status",
|
||||||
attributes: {
|
attributes: {
|
||||||
jobId,
|
jobId
|
||||||
}
|
}
|
||||||
}, async span => {
|
},
|
||||||
return await robustFetch(
|
async (span) => {
|
||||||
{
|
return await robustFetch({
|
||||||
url: `${fireEngineURL}/scrape/${jobId}`,
|
url: `${fireEngineURL}/scrape/${jobId}`,
|
||||||
method: "GET",
|
method: "GET",
|
||||||
logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
|
logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
|
||||||
headers: {
|
headers: {
|
||||||
...(Sentry.isInitialized() ? ({
|
...(Sentry.isInitialized()
|
||||||
|
? {
|
||||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||||
"baggage": Sentry.spanToBaggageHeader(span),
|
baggage: Sentry.spanToBaggageHeader(span)
|
||||||
}) : {}),
|
}
|
||||||
},
|
: {})
|
||||||
}
|
}
|
||||||
)
|
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
const successParse = successSchema.safeParse(status);
|
const successParse = successSchema.safeParse(status);
|
||||||
const processingParse = processingSchema.safeParse(status);
|
const processingParse = processingSchema.safeParse(status);
|
||||||
@@ -90,21 +106,32 @@ export async function fireEngineCheckStatus(logger: Logger, jobId: string): Prom
|
|||||||
throw new StillProcessingError(jobId);
|
throw new StillProcessingError(jobId);
|
||||||
} else if (failedParse.success) {
|
} else if (failedParse.success) {
|
||||||
logger.debug("Scrape job failed", { status, jobId });
|
logger.debug("Scrape job failed", { status, jobId });
|
||||||
if (typeof status.error === "string" && status.error.includes("Chrome error: ")) {
|
if (
|
||||||
|
typeof status.error === "string" &&
|
||||||
|
status.error.includes("Chrome error: ")
|
||||||
|
) {
|
||||||
throw new SiteError(status.error.split("Chrome error: ")[1]);
|
throw new SiteError(status.error.split("Chrome error: ")[1]);
|
||||||
} else {
|
} else {
|
||||||
throw new EngineError("Scrape job failed", {
|
throw new EngineError("Scrape job failed", {
|
||||||
cause: {
|
cause: {
|
||||||
status, jobId
|
status,
|
||||||
|
jobId
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Check status returned response not matched by any schema", { status, jobId });
|
logger.debug("Check status returned response not matched by any schema", {
|
||||||
throw new Error("Check status returned response not matched by any schema", {
|
status,
|
||||||
cause: {
|
jobId
|
||||||
status, jobId
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
throw new Error(
|
||||||
|
"Check status returned response not matched by any schema",
|
||||||
|
{
|
||||||
|
cause: {
|
||||||
|
status,
|
||||||
|
jobId
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,28 +6,31 @@ import { robustFetch } from "../../lib/fetch";
|
|||||||
export async function fireEngineDelete(logger: Logger, jobId: string) {
|
export async function fireEngineDelete(logger: Logger, jobId: string) {
|
||||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||||
|
|
||||||
await Sentry.startSpan({
|
await Sentry.startSpan(
|
||||||
|
{
|
||||||
name: "fire-engine: Delete scrape",
|
name: "fire-engine: Delete scrape",
|
||||||
attributes: {
|
attributes: {
|
||||||
jobId,
|
jobId
|
||||||
}
|
}
|
||||||
}, async span => {
|
},
|
||||||
await robustFetch(
|
async (span) => {
|
||||||
{
|
await robustFetch({
|
||||||
url: `${fireEngineURL}/scrape/${jobId}`,
|
url: `${fireEngineURL}/scrape/${jobId}`,
|
||||||
method: "DELETE",
|
method: "DELETE",
|
||||||
headers: {
|
headers: {
|
||||||
...(Sentry.isInitialized() ? ({
|
...(Sentry.isInitialized()
|
||||||
|
? {
|
||||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||||
"baggage": Sentry.spanToBaggageHeader(span),
|
baggage: Sentry.spanToBaggageHeader(span)
|
||||||
}) : {}),
|
}
|
||||||
|
: {})
|
||||||
},
|
},
|
||||||
ignoreResponse: true,
|
ignoreResponse: true,
|
||||||
ignoreFailure: true,
|
ignoreFailure: true,
|
||||||
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
|
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId })
|
||||||
}
|
|
||||||
)
|
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
// We do not care whether this fails or not.
|
// We do not care whether this fails or not.
|
||||||
}
|
}
|
||||||
@@ -1,8 +1,18 @@
|
|||||||
import { Logger } from "winston";
|
import { Logger } from "winston";
|
||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape";
|
import {
|
||||||
|
fireEngineScrape,
|
||||||
|
FireEngineScrapeRequestChromeCDP,
|
||||||
|
FireEngineScrapeRequestCommon,
|
||||||
|
FireEngineScrapeRequestPlaywright,
|
||||||
|
FireEngineScrapeRequestTLSClient
|
||||||
|
} from "./scrape";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus";
|
import {
|
||||||
|
fireEngineCheckStatus,
|
||||||
|
FireEngineCheckStatusSuccess,
|
||||||
|
StillProcessingError
|
||||||
|
} from "./checkStatus";
|
||||||
import { EngineError, SiteError, TimeoutError } from "../../error";
|
import { EngineError, SiteError, TimeoutError } from "../../error";
|
||||||
import * as Sentry from "@sentry/node";
|
import * as Sentry from "@sentry/node";
|
||||||
import { Action } from "../../../../lib/entities";
|
import { Action } from "../../../../lib/entities";
|
||||||
@@ -13,12 +23,20 @@ export const defaultTimeout = 10000;
|
|||||||
// This function does not take `Meta` on purpose. It may not access any
|
// This function does not take `Meta` on purpose. It may not access any
|
||||||
// meta values to construct the request -- that must be done by the
|
// meta values to construct the request -- that must be done by the
|
||||||
// `scrapeURLWithFireEngine*` functions.
|
// `scrapeURLWithFireEngine*` functions.
|
||||||
async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient>(
|
async function performFireEngineScrape<
|
||||||
|
Engine extends
|
||||||
|
| FireEngineScrapeRequestChromeCDP
|
||||||
|
| FireEngineScrapeRequestPlaywright
|
||||||
|
| FireEngineScrapeRequestTLSClient
|
||||||
|
>(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
request: FireEngineScrapeRequestCommon & Engine,
|
request: FireEngineScrapeRequestCommon & Engine,
|
||||||
timeout = defaultTimeout,
|
timeout = defaultTimeout
|
||||||
): Promise<FireEngineCheckStatusSuccess> {
|
): Promise<FireEngineCheckStatusSuccess> {
|
||||||
const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request);
|
const scrape = await fireEngineScrape(
|
||||||
|
logger.child({ method: "fireEngineScrape" }),
|
||||||
|
request
|
||||||
|
);
|
||||||
|
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
const errorLimit = 3;
|
const errorLimit = 3;
|
||||||
@@ -28,26 +46,43 @@ async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChr
|
|||||||
while (status === undefined) {
|
while (status === undefined) {
|
||||||
if (errors.length >= errorLimit) {
|
if (errors.length >= errorLimit) {
|
||||||
logger.error("Error limit hit.", { errors });
|
logger.error("Error limit hit.", { errors });
|
||||||
throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } });
|
throw new Error("Error limit hit. See e.cause.errors for errors.", {
|
||||||
|
cause: { errors }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Date.now() - startTime > timeout) {
|
if (Date.now() - startTime > timeout) {
|
||||||
logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout });
|
logger.info(
|
||||||
throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } });
|
"Fire-engine was unable to scrape the page before timing out.",
|
||||||
|
{ errors, timeout }
|
||||||
|
);
|
||||||
|
throw new TimeoutError(
|
||||||
|
"Fire-engine was unable to scrape the page before timing out",
|
||||||
|
{ cause: { errors, timeout } }
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
status = await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId)
|
status = await fireEngineCheckStatus(
|
||||||
|
logger.child({ method: "fireEngineCheckStatus" }),
|
||||||
|
scrape.jobId
|
||||||
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof StillProcessingError) {
|
if (error instanceof StillProcessingError) {
|
||||||
// nop
|
// nop
|
||||||
} else if (error instanceof EngineError || error instanceof SiteError) {
|
} else if (error instanceof EngineError || error instanceof SiteError) {
|
||||||
logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId });
|
logger.debug("Fire-engine scrape job failed.", {
|
||||||
|
error,
|
||||||
|
jobId: scrape.jobId
|
||||||
|
});
|
||||||
throw error;
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
errors.push(error);
|
errors.push(error);
|
||||||
logger.debug(`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId });
|
logger.debug(
|
||||||
|
`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`,
|
||||||
|
{ error, jobId: scrape.jobId }
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,62 +92,97 @@ async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChr
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeURLWithFireEngineChromeCDP(
|
||||||
|
meta: Meta
|
||||||
|
): Promise<EngineScrapeResult> {
|
||||||
const actions: Action[] = [
|
const actions: Action[] = [
|
||||||
// Transform waitFor option into an action (unsupported by chrome-cdp)
|
// Transform waitFor option into an action (unsupported by chrome-cdp)
|
||||||
...(meta.options.waitFor !== 0 ? [{
|
...(meta.options.waitFor !== 0
|
||||||
|
? [
|
||||||
|
{
|
||||||
type: "wait" as const,
|
type: "wait" as const,
|
||||||
milliseconds: meta.options.waitFor,
|
milliseconds: meta.options.waitFor
|
||||||
}] : []),
|
}
|
||||||
|
]
|
||||||
|
: []),
|
||||||
|
|
||||||
// Transform screenshot format into an action (unsupported by chrome-cdp)
|
// Transform screenshot format into an action (unsupported by chrome-cdp)
|
||||||
...(meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage") ? [{
|
...(meta.options.formats.includes("screenshot") ||
|
||||||
|
meta.options.formats.includes("screenshot@fullPage")
|
||||||
|
? [
|
||||||
|
{
|
||||||
type: "screenshot" as const,
|
type: "screenshot" as const,
|
||||||
fullPage: meta.options.formats.includes("screenshot@fullPage"),
|
fullPage: meta.options.formats.includes("screenshot@fullPage")
|
||||||
}] : []),
|
}
|
||||||
|
]
|
||||||
|
: []),
|
||||||
|
|
||||||
// Include specified actions
|
// Include specified actions
|
||||||
...(meta.options.actions ?? []),
|
...(meta.options.actions ?? [])
|
||||||
];
|
];
|
||||||
|
|
||||||
const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = {
|
const request: FireEngineScrapeRequestCommon &
|
||||||
|
FireEngineScrapeRequestChromeCDP = {
|
||||||
url: meta.url,
|
url: meta.url,
|
||||||
engine: "chrome-cdp",
|
engine: "chrome-cdp",
|
||||||
instantReturn: true,
|
instantReturn: true,
|
||||||
skipTlsVerification: meta.options.skipTlsVerification,
|
skipTlsVerification: meta.options.skipTlsVerification,
|
||||||
headers: meta.options.headers,
|
headers: meta.options.headers,
|
||||||
...(actions.length > 0 ? ({
|
...(actions.length > 0
|
||||||
actions,
|
? {
|
||||||
}) : {}),
|
actions
|
||||||
|
}
|
||||||
|
: {}),
|
||||||
priority: meta.internalOptions.priority,
|
priority: meta.internalOptions.priority,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation,
|
||||||
mobile: meta.options.mobile,
|
mobile: meta.options.mobile,
|
||||||
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
||||||
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
|
disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache
|
||||||
// TODO: scrollXPaths
|
// TODO: scrollXPaths
|
||||||
};
|
};
|
||||||
|
|
||||||
const totalWait = actions.reduce((a,x) => x.type === "wait" ? (x.milliseconds ?? 1000) + a : a, 0);
|
const totalWait = actions.reduce(
|
||||||
|
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 1000) + a : a),
|
||||||
let response = await performFireEngineScrape(
|
0
|
||||||
meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
|
|
||||||
request,
|
|
||||||
meta.options.timeout !== undefined
|
|
||||||
? defaultTimeout + totalWait
|
|
||||||
: Infinity, // TODO: better timeout handling
|
|
||||||
);
|
);
|
||||||
|
|
||||||
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
|
let response = await performFireEngineScrape(
|
||||||
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
||||||
|
request
|
||||||
|
}),
|
||||||
|
request,
|
||||||
|
meta.options.timeout !== undefined ? defaultTimeout + totalWait : Infinity // TODO: better timeout handling
|
||||||
|
);
|
||||||
|
|
||||||
if (meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage")) {
|
specialtyScrapeCheck(
|
||||||
meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots });
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck"
|
||||||
|
}),
|
||||||
|
response.responseHeaders
|
||||||
|
);
|
||||||
|
|
||||||
|
if (
|
||||||
|
meta.options.formats.includes("screenshot") ||
|
||||||
|
meta.options.formats.includes("screenshot@fullPage")
|
||||||
|
) {
|
||||||
|
meta.logger.debug(
|
||||||
|
"Transforming screenshots from actions into screenshot field",
|
||||||
|
{ screenshots: response.screenshots }
|
||||||
|
);
|
||||||
response.screenshot = (response.screenshots ?? [])[0];
|
response.screenshot = (response.screenshots ?? [])[0];
|
||||||
(response.screenshots ?? []).splice(0, 1);
|
(response.screenshots ?? []).splice(0, 1);
|
||||||
meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot });
|
meta.logger.debug("Screenshot transformation done", {
|
||||||
|
screenshots: response.screenshots,
|
||||||
|
screenshot: response.screenshot
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!response.url) {
|
if (!response.url) {
|
||||||
meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
|
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||||
|
response,
|
||||||
|
sourceURL: meta.url
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -123,17 +193,22 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
|
|||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
|
|
||||||
screenshot: response.screenshot,
|
screenshot: response.screenshot,
|
||||||
...(actions.length > 0 ? {
|
...(actions.length > 0
|
||||||
|
? {
|
||||||
actions: {
|
actions: {
|
||||||
screenshots: response.screenshots ?? [],
|
screenshots: response.screenshots ?? [],
|
||||||
scrapes: response.actionContent ?? [],
|
scrapes: response.actionContent ?? []
|
||||||
}
|
}
|
||||||
} : {}),
|
}
|
||||||
|
: {})
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeURLWithFireEnginePlaywright(
|
||||||
const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = {
|
meta: Meta
|
||||||
|
): Promise<EngineScrapeResult> {
|
||||||
|
const request: FireEngineScrapeRequestCommon &
|
||||||
|
FireEngineScrapeRequestPlaywright = {
|
||||||
url: meta.url,
|
url: meta.url,
|
||||||
engine: "playwright",
|
engine: "playwright",
|
||||||
instantReturn: true,
|
instantReturn: true,
|
||||||
@@ -145,21 +220,32 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
|
|||||||
wait: meta.options.waitFor,
|
wait: meta.options.waitFor,
|
||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation,
|
||||||
|
|
||||||
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = await performFireEngineScrape(
|
let response = await performFireEngineScrape(
|
||||||
meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
||||||
|
request
|
||||||
|
}),
|
||||||
request,
|
request,
|
||||||
meta.options.timeout !== undefined
|
meta.options.timeout !== undefined
|
||||||
? defaultTimeout + meta.options.waitFor
|
? defaultTimeout + meta.options.waitFor
|
||||||
: Infinity, // TODO: better timeout handling
|
: Infinity // TODO: better timeout handling
|
||||||
);
|
);
|
||||||
|
|
||||||
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
|
specialtyScrapeCheck(
|
||||||
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck"
|
||||||
|
}),
|
||||||
|
response.responseHeaders
|
||||||
|
);
|
||||||
|
|
||||||
if (!response.url) {
|
if (!response.url) {
|
||||||
meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
|
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||||
|
response,
|
||||||
|
sourceURL: meta.url
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -169,14 +255,19 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
|
|||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
|
|
||||||
...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({
|
...(response.screenshots !== undefined && response.screenshots.length > 0
|
||||||
screenshot: response.screenshots[0],
|
? {
|
||||||
}) : {}),
|
screenshot: response.screenshots[0]
|
||||||
|
}
|
||||||
|
: {})
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeURLWithFireEngineTLSClient(
|
||||||
const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = {
|
meta: Meta
|
||||||
|
): Promise<EngineScrapeResult> {
|
||||||
|
const request: FireEngineScrapeRequestCommon &
|
||||||
|
FireEngineScrapeRequestTLSClient = {
|
||||||
url: meta.url,
|
url: meta.url,
|
||||||
engine: "tlsclient",
|
engine: "tlsclient",
|
||||||
instantReturn: true,
|
instantReturn: true,
|
||||||
@@ -188,21 +279,30 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
|
|||||||
geolocation: meta.options.geolocation,
|
geolocation: meta.options.geolocation,
|
||||||
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
disableJsDom: meta.internalOptions.v0DisableJsDom,
|
||||||
|
|
||||||
timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
|
timeout: meta.options.timeout === undefined ? 300000 : undefined // TODO: better timeout logic
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = await performFireEngineScrape(
|
let response = await performFireEngineScrape(
|
||||||
meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
|
||||||
|
request
|
||||||
|
}),
|
||||||
request,
|
request,
|
||||||
meta.options.timeout !== undefined
|
meta.options.timeout !== undefined ? defaultTimeout : Infinity // TODO: better timeout handling
|
||||||
? defaultTimeout
|
|
||||||
: Infinity, // TODO: better timeout handling
|
|
||||||
);
|
);
|
||||||
|
|
||||||
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);
|
specialtyScrapeCheck(
|
||||||
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck"
|
||||||
|
}),
|
||||||
|
response.responseHeaders
|
||||||
|
);
|
||||||
|
|
||||||
if (!response.url) {
|
if (!response.url) {
|
||||||
meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
|
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||||
|
response,
|
||||||
|
sourceURL: meta.url
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -210,6 +310,6 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
|
|||||||
|
|
||||||
html: response.content,
|
html: response.content,
|
||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,10 +24,10 @@ export type FireEngineScrapeRequestCommon = {
|
|||||||
// team_id?: string; // unused
|
// team_id?: string; // unused
|
||||||
logRequest?: boolean; // default: true
|
logRequest?: boolean; // default: true
|
||||||
instantReturn?: boolean; // default: false
|
instantReturn?: boolean; // default: false
|
||||||
geolocation?: { country?: string; languages?: string[]; };
|
geolocation?: { country?: string; languages?: string[] };
|
||||||
|
|
||||||
timeout?: number;
|
timeout?: number;
|
||||||
}
|
};
|
||||||
|
|
||||||
export type FireEngineScrapeRequestChromeCDP = {
|
export type FireEngineScrapeRequestChromeCDP = {
|
||||||
engine: "chrome-cdp";
|
engine: "chrome-cdp";
|
||||||
@@ -58,40 +58,48 @@ export type FireEngineScrapeRequestTLSClient = {
|
|||||||
|
|
||||||
const schema = z.object({
|
const schema = z.object({
|
||||||
jobId: z.string(),
|
jobId: z.string(),
|
||||||
processing: z.boolean(),
|
processing: z.boolean()
|
||||||
});
|
});
|
||||||
|
|
||||||
export async function fireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient> (
|
export async function fireEngineScrape<
|
||||||
|
Engine extends
|
||||||
|
| FireEngineScrapeRequestChromeCDP
|
||||||
|
| FireEngineScrapeRequestPlaywright
|
||||||
|
| FireEngineScrapeRequestTLSClient
|
||||||
|
>(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
request: FireEngineScrapeRequestCommon & Engine,
|
request: FireEngineScrapeRequestCommon & Engine
|
||||||
): Promise<z.infer<typeof schema>> {
|
): Promise<z.infer<typeof schema>> {
|
||||||
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
||||||
|
|
||||||
// TODO: retries
|
// TODO: retries
|
||||||
|
|
||||||
const scrapeRequest = await Sentry.startSpan({
|
const scrapeRequest = await Sentry.startSpan(
|
||||||
|
{
|
||||||
name: "fire-engine: Scrape",
|
name: "fire-engine: Scrape",
|
||||||
attributes: {
|
attributes: {
|
||||||
url: request.url,
|
url: request.url
|
||||||
|
}
|
||||||
},
|
},
|
||||||
}, async span => {
|
async (span) => {
|
||||||
return await robustFetch(
|
return await robustFetch({
|
||||||
{
|
|
||||||
url: `${fireEngineURL}/scrape`,
|
url: `${fireEngineURL}/scrape`,
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
...(Sentry.isInitialized() ? ({
|
...(Sentry.isInitialized()
|
||||||
|
? {
|
||||||
"sentry-trace": Sentry.spanToTraceHeader(span),
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
||||||
"baggage": Sentry.spanToBaggageHeader(span),
|
baggage: Sentry.spanToBaggageHeader(span)
|
||||||
}) : {}),
|
}
|
||||||
|
: {})
|
||||||
},
|
},
|
||||||
body: request,
|
body: request,
|
||||||
logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
|
logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
|
||||||
schema,
|
schema,
|
||||||
tryCount: 3,
|
tryCount: 3
|
||||||
|
});
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
});
|
|
||||||
|
|
||||||
return scrapeRequest;
|
return scrapeRequest;
|
||||||
}
|
}
|
||||||
@@ -1,29 +1,58 @@
|
|||||||
import { ScrapeActionContent } from "../../../lib/entities";
|
import { ScrapeActionContent } from "../../../lib/entities";
|
||||||
import { Meta } from "..";
|
import { Meta } from "..";
|
||||||
import { scrapeDOCX } from "./docx";
|
import { scrapeDOCX } from "./docx";
|
||||||
import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
|
import {
|
||||||
|
scrapeURLWithFireEngineChromeCDP,
|
||||||
|
scrapeURLWithFireEnginePlaywright,
|
||||||
|
scrapeURLWithFireEngineTLSClient
|
||||||
|
} from "./fire-engine";
|
||||||
import { scrapePDF } from "./pdf";
|
import { scrapePDF } from "./pdf";
|
||||||
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
import { scrapeURLWithScrapingBee } from "./scrapingbee";
|
||||||
import { scrapeURLWithFetch } from "./fetch";
|
import { scrapeURLWithFetch } from "./fetch";
|
||||||
import { scrapeURLWithPlaywright } from "./playwright";
|
import { scrapeURLWithPlaywright } from "./playwright";
|
||||||
import { scrapeCache } from "./cache";
|
import { scrapeCache } from "./cache";
|
||||||
|
|
||||||
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
|
export type Engine =
|
||||||
|
| "fire-engine;chrome-cdp"
|
||||||
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
|
| "fire-engine;playwright"
|
||||||
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
| "fire-engine;tlsclient"
|
||||||
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
|
| "scrapingbee"
|
||||||
const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
|
| "scrapingbeeLoad"
|
||||||
|
| "playwright"
|
||||||
|
| "fetch"
|
||||||
|
| "pdf"
|
||||||
|
| "docx"
|
||||||
|
| "cache";
|
||||||
|
|
||||||
|
const useScrapingBee =
|
||||||
|
process.env.SCRAPING_BEE_API_KEY !== "" &&
|
||||||
|
process.env.SCRAPING_BEE_API_KEY !== undefined;
|
||||||
|
const useFireEngine =
|
||||||
|
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
||||||
|
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||||
|
const usePlaywright =
|
||||||
|
process.env.PLAYWRIGHT_MICROSERVICE_URL !== "" &&
|
||||||
|
process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
|
||||||
|
const useCache =
|
||||||
|
process.env.CACHE_REDIS_URL !== "" &&
|
||||||
|
process.env.CACHE_REDIS_URL !== undefined;
|
||||||
|
|
||||||
export const engines: Engine[] = [
|
export const engines: Engine[] = [
|
||||||
// ...(useCache ? [ "cache" as const ] : []),
|
// ...(useCache ? [ "cache" as const ] : []),
|
||||||
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
|
...(useFireEngine
|
||||||
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
|
? [
|
||||||
...(usePlaywright ? [ "playwright" as const ] : []),
|
"fire-engine;chrome-cdp" as const,
|
||||||
|
"fire-engine;playwright" as const,
|
||||||
|
"fire-engine;tlsclient" as const
|
||||||
|
]
|
||||||
|
: []),
|
||||||
|
...(useScrapingBee
|
||||||
|
? ["scrapingbee" as const, "scrapingbeeLoad" as const]
|
||||||
|
: []),
|
||||||
|
...(usePlaywright ? ["playwright" as const] : []),
|
||||||
"fetch",
|
"fetch",
|
||||||
"pdf",
|
"pdf",
|
||||||
"docx",
|
"docx"
|
||||||
];
|
];
|
||||||
|
|
||||||
export const featureFlags = [
|
export const featureFlags = [
|
||||||
@@ -37,27 +66,27 @@ export const featureFlags = [
|
|||||||
"location",
|
"location",
|
||||||
"mobile",
|
"mobile",
|
||||||
"skipTlsVerification",
|
"skipTlsVerification",
|
||||||
"useFastMode",
|
"useFastMode"
|
||||||
] as const;
|
] as const;
|
||||||
|
|
||||||
export type FeatureFlag = typeof featureFlags[number];
|
export type FeatureFlag = (typeof featureFlags)[number];
|
||||||
|
|
||||||
export const featureFlagOptions: {
|
export const featureFlagOptions: {
|
||||||
[F in FeatureFlag]: {
|
[F in FeatureFlag]: {
|
||||||
priority: number;
|
priority: number;
|
||||||
}
|
};
|
||||||
} = {
|
} = {
|
||||||
"actions": { priority: 20 },
|
actions: { priority: 20 },
|
||||||
"waitFor": { priority: 1 },
|
waitFor: { priority: 1 },
|
||||||
"screenshot": { priority: 10 },
|
screenshot: { priority: 10 },
|
||||||
"screenshot@fullScreen": { priority: 10 },
|
"screenshot@fullScreen": { priority: 10 },
|
||||||
"pdf": { priority: 100 },
|
pdf: { priority: 100 },
|
||||||
"docx": { priority: 100 },
|
docx: { priority: 100 },
|
||||||
"atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
|
atsv: { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
|
||||||
"useFastMode": { priority: 90 },
|
useFastMode: { priority: 90 },
|
||||||
"location": { priority: 10 },
|
location: { priority: 10 },
|
||||||
"mobile": { priority: 10 },
|
mobile: { priority: 10 },
|
||||||
"skipTlsVerification": { priority: 10 },
|
skipTlsVerification: { priority: 10 }
|
||||||
} as const;
|
} as const;
|
||||||
|
|
||||||
export type EngineScrapeResult = {
|
export type EngineScrapeResult = {
|
||||||
@@ -73,212 +102,227 @@ export type EngineScrapeResult = {
|
|||||||
screenshots: string[];
|
screenshots: string[];
|
||||||
scrapes: ScrapeActionContent[];
|
scrapes: ScrapeActionContent[];
|
||||||
};
|
};
|
||||||
}
|
};
|
||||||
|
|
||||||
const engineHandlers: {
|
const engineHandlers: {
|
||||||
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
|
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>;
|
||||||
} = {
|
} = {
|
||||||
"cache": scrapeCache,
|
cache: scrapeCache,
|
||||||
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
|
||||||
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
|
||||||
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
|
||||||
"scrapingbee": scrapeURLWithScrapingBee("domcontentloaded"),
|
scrapingbee: scrapeURLWithScrapingBee("domcontentloaded"),
|
||||||
"scrapingbeeLoad": scrapeURLWithScrapingBee("networkidle2"),
|
scrapingbeeLoad: scrapeURLWithScrapingBee("networkidle2"),
|
||||||
"playwright": scrapeURLWithPlaywright,
|
playwright: scrapeURLWithPlaywright,
|
||||||
"fetch": scrapeURLWithFetch,
|
fetch: scrapeURLWithFetch,
|
||||||
"pdf": scrapePDF,
|
pdf: scrapePDF,
|
||||||
"docx": scrapeDOCX,
|
docx: scrapeDOCX
|
||||||
};
|
};
|
||||||
|
|
||||||
export const engineOptions: {
|
export const engineOptions: {
|
||||||
[E in Engine]: {
|
[E in Engine]: {
|
||||||
// A list of feature flags the engine supports.
|
// A list of feature flags the engine supports.
|
||||||
features: { [F in FeatureFlag]: boolean },
|
features: { [F in FeatureFlag]: boolean };
|
||||||
|
|
||||||
// This defines the order of engines in general. The engine with the highest quality will be used the most.
|
// This defines the order of engines in general. The engine with the highest quality will be used the most.
|
||||||
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
|
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
|
||||||
quality: number,
|
quality: number;
|
||||||
}
|
};
|
||||||
} = {
|
} = {
|
||||||
"cache": {
|
cache: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": true,
|
waitFor: true,
|
||||||
"screenshot": false,
|
screenshot: false,
|
||||||
"screenshot@fullScreen": false,
|
"screenshot@fullScreen": false,
|
||||||
"pdf": false, // TODO: figure this out
|
pdf: false, // TODO: figure this out
|
||||||
"docx": false, // TODO: figure this out
|
docx: false, // TODO: figure this out
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": false,
|
useFastMode: false
|
||||||
},
|
},
|
||||||
quality: 1000, // cache should always be tried first
|
quality: 1000 // cache should always be tried first
|
||||||
},
|
},
|
||||||
"fire-engine;chrome-cdp": {
|
"fire-engine;chrome-cdp": {
|
||||||
features: {
|
features: {
|
||||||
"actions": true,
|
actions: true,
|
||||||
"waitFor": true, // through actions transform
|
waitFor: true, // through actions transform
|
||||||
"screenshot": true, // through actions transform
|
screenshot: true, // through actions transform
|
||||||
"screenshot@fullScreen": true, // through actions transform
|
"screenshot@fullScreen": true, // through actions transform
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": true,
|
location: true,
|
||||||
"mobile": true,
|
mobile: true,
|
||||||
"skipTlsVerification": true,
|
skipTlsVerification: true,
|
||||||
"useFastMode": false,
|
useFastMode: false
|
||||||
},
|
},
|
||||||
quality: 50,
|
quality: 50
|
||||||
},
|
},
|
||||||
"fire-engine;playwright": {
|
"fire-engine;playwright": {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": true,
|
waitFor: true,
|
||||||
"screenshot": true,
|
screenshot: true,
|
||||||
"screenshot@fullScreen": true,
|
"screenshot@fullScreen": true,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": false,
|
useFastMode: false
|
||||||
},
|
},
|
||||||
quality: 40,
|
quality: 40
|
||||||
},
|
},
|
||||||
"scrapingbee": {
|
scrapingbee: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": true,
|
waitFor: true,
|
||||||
"screenshot": true,
|
screenshot: true,
|
||||||
"screenshot@fullScreen": true,
|
"screenshot@fullScreen": true,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": false,
|
useFastMode: false
|
||||||
},
|
},
|
||||||
quality: 30,
|
quality: 30
|
||||||
},
|
},
|
||||||
"scrapingbeeLoad": {
|
scrapingbeeLoad: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": true,
|
waitFor: true,
|
||||||
"screenshot": true,
|
screenshot: true,
|
||||||
"screenshot@fullScreen": true,
|
"screenshot@fullScreen": true,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": false,
|
useFastMode: false
|
||||||
},
|
},
|
||||||
quality: 29,
|
quality: 29
|
||||||
},
|
},
|
||||||
"playwright": {
|
playwright: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": true,
|
waitFor: true,
|
||||||
"screenshot": false,
|
screenshot: false,
|
||||||
"screenshot@fullScreen": false,
|
"screenshot@fullScreen": false,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": false,
|
useFastMode: false
|
||||||
},
|
},
|
||||||
quality: 20,
|
quality: 20
|
||||||
},
|
},
|
||||||
"fire-engine;tlsclient": {
|
"fire-engine;tlsclient": {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": false,
|
waitFor: false,
|
||||||
"screenshot": false,
|
screenshot: false,
|
||||||
"screenshot@fullScreen": false,
|
"screenshot@fullScreen": false,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": true,
|
atsv: true,
|
||||||
"location": true,
|
location: true,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": true,
|
useFastMode: true
|
||||||
},
|
},
|
||||||
quality: 10,
|
quality: 10
|
||||||
},
|
},
|
||||||
"fetch": {
|
fetch: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": false,
|
waitFor: false,
|
||||||
"screenshot": false,
|
screenshot: false,
|
||||||
"screenshot@fullScreen": false,
|
"screenshot@fullScreen": false,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": true,
|
useFastMode: true
|
||||||
},
|
},
|
||||||
quality: 5,
|
quality: 5
|
||||||
},
|
},
|
||||||
"pdf": {
|
pdf: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": false,
|
waitFor: false,
|
||||||
"screenshot": false,
|
screenshot: false,
|
||||||
"screenshot@fullScreen": false,
|
"screenshot@fullScreen": false,
|
||||||
"pdf": true,
|
pdf: true,
|
||||||
"docx": false,
|
docx: false,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": true,
|
useFastMode: true
|
||||||
},
|
},
|
||||||
quality: -10,
|
quality: -10
|
||||||
},
|
},
|
||||||
"docx": {
|
docx: {
|
||||||
features: {
|
features: {
|
||||||
"actions": false,
|
actions: false,
|
||||||
"waitFor": false,
|
waitFor: false,
|
||||||
"screenshot": false,
|
screenshot: false,
|
||||||
"screenshot@fullScreen": false,
|
"screenshot@fullScreen": false,
|
||||||
"pdf": false,
|
pdf: false,
|
||||||
"docx": true,
|
docx: true,
|
||||||
"atsv": false,
|
atsv: false,
|
||||||
"location": false,
|
location: false,
|
||||||
"mobile": false,
|
mobile: false,
|
||||||
"skipTlsVerification": false,
|
skipTlsVerification: false,
|
||||||
"useFastMode": true,
|
useFastMode: true
|
||||||
},
|
|
||||||
quality: -10,
|
|
||||||
},
|
},
|
||||||
|
quality: -10
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
export function buildFallbackList(meta: Meta): {
|
export function buildFallbackList(meta: Meta): {
|
||||||
engine: Engine,
|
engine: Engine;
|
||||||
unsupportedFeatures: Set<FeatureFlag>,
|
unsupportedFeatures: Set<FeatureFlag>;
|
||||||
}[] {
|
}[] {
|
||||||
const prioritySum = [...meta.featureFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0);
|
const prioritySum = [...meta.featureFlags].reduce(
|
||||||
|
(a, x) => a + featureFlagOptions[x].priority,
|
||||||
|
0
|
||||||
|
);
|
||||||
const priorityThreshold = Math.floor(prioritySum / 2);
|
const priorityThreshold = Math.floor(prioritySum / 2);
|
||||||
let selectedEngines: {
|
let selectedEngines: {
|
||||||
engine: Engine,
|
engine: Engine;
|
||||||
supportScore: number,
|
supportScore: number;
|
||||||
unsupportedFeatures: Set<FeatureFlag>,
|
unsupportedFeatures: Set<FeatureFlag>;
|
||||||
}[] = [];
|
}[] = [];
|
||||||
|
|
||||||
const currentEngines = meta.internalOptions.forceEngine !== undefined ? [meta.internalOptions.forceEngine] : engines;
|
const currentEngines =
|
||||||
|
meta.internalOptions.forceEngine !== undefined
|
||||||
|
? [meta.internalOptions.forceEngine]
|
||||||
|
: engines;
|
||||||
|
|
||||||
for (const engine of currentEngines) {
|
for (const engine of currentEngines) {
|
||||||
const supportedFlags = new Set([...Object.entries(engineOptions[engine].features).filter(([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true).map(([k, _]) => k)]);
|
const supportedFlags = new Set([
|
||||||
const supportScore = [...supportedFlags].reduce((a, x) => a + featureFlagOptions[x].priority, 0);
|
...Object.entries(engineOptions[engine].features)
|
||||||
|
.filter(
|
||||||
|
([k, v]) => meta.featureFlags.has(k as FeatureFlag) && v === true
|
||||||
|
)
|
||||||
|
.map(([k, _]) => k)
|
||||||
|
]);
|
||||||
|
const supportScore = [...supportedFlags].reduce(
|
||||||
|
(a, x) => a + featureFlagOptions[x].priority,
|
||||||
|
0
|
||||||
|
);
|
||||||
|
|
||||||
const unsupportedFeatures = new Set([...meta.featureFlags]);
|
const unsupportedFeatures = new Set([...meta.featureFlags]);
|
||||||
for (const flag of meta.featureFlags) {
|
for (const flag of meta.featureFlags) {
|
||||||
@@ -289,27 +333,54 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
|
|
||||||
if (supportScore >= priorityThreshold) {
|
if (supportScore >= priorityThreshold) {
|
||||||
selectedEngines.push({ engine, supportScore, unsupportedFeatures });
|
selectedEngines.push({ engine, supportScore, unsupportedFeatures });
|
||||||
meta.logger.debug(`Engine ${engine} meets feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures });
|
meta.logger.debug(`Engine ${engine} meets feature priority threshold`, {
|
||||||
|
supportScore,
|
||||||
|
prioritySum,
|
||||||
|
priorityThreshold,
|
||||||
|
featureFlags: [...meta.featureFlags],
|
||||||
|
unsupportedFeatures
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures});
|
meta.logger.debug(
|
||||||
|
`Engine ${engine} does not meet feature priority threshold`,
|
||||||
|
{
|
||||||
|
supportScore,
|
||||||
|
prioritySum,
|
||||||
|
priorityThreshold,
|
||||||
|
featureFlags: [...meta.featureFlags],
|
||||||
|
unsupportedFeatures
|
||||||
|
}
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (selectedEngines.some(x => engineOptions[x.engine].quality > 0)) {
|
if (selectedEngines.some((x) => engineOptions[x.engine].quality > 0)) {
|
||||||
selectedEngines = selectedEngines.filter(x => engineOptions[x.engine].quality > 0);
|
selectedEngines = selectedEngines.filter(
|
||||||
|
(x) => engineOptions[x.engine].quality > 0
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
selectedEngines.sort((a,b) => b.supportScore - a.supportScore || engineOptions[b.engine].quality - engineOptions[a.engine].quality);
|
selectedEngines.sort(
|
||||||
|
(a, b) =>
|
||||||
|
b.supportScore - a.supportScore ||
|
||||||
|
engineOptions[b.engine].quality - engineOptions[a.engine].quality
|
||||||
|
);
|
||||||
|
|
||||||
return selectedEngines;
|
return selectedEngines;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
|
export async function scrapeURLWithEngine(
|
||||||
|
meta: Meta,
|
||||||
|
engine: Engine
|
||||||
|
): Promise<EngineScrapeResult> {
|
||||||
const fn = engineHandlers[engine];
|
const fn = engineHandlers[engine];
|
||||||
const logger = meta.logger.child({ method: fn.name ?? "scrapeURLWithEngine", engine });
|
const logger = meta.logger.child({
|
||||||
|
method: fn.name ?? "scrapeURLWithEngine",
|
||||||
|
engine
|
||||||
|
});
|
||||||
const _meta = {
|
const _meta = {
|
||||||
...meta,
|
...meta,
|
||||||
logger,
|
logger
|
||||||
};
|
};
|
||||||
|
|
||||||
return await fn(_meta);
|
return await fn(_meta);
|
||||||
|
|||||||
@@ -10,10 +10,15 @@ import PdfParse from "pdf-parse";
|
|||||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||||
import { RemoveFeatureError } from "../../error";
|
import { RemoveFeatureError } from "../../error";
|
||||||
|
|
||||||
type PDFProcessorResult = {html: string, markdown?: string};
|
type PDFProcessorResult = { html: string; markdown?: string };
|
||||||
|
|
||||||
async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
|
async function scrapePDFWithLlamaParse(
|
||||||
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
|
meta: Meta,
|
||||||
|
tempFilePath: string
|
||||||
|
): Promise<PDFProcessorResult> {
|
||||||
|
meta.logger.debug("Processing PDF document with LlamaIndex", {
|
||||||
|
tempFilePath
|
||||||
|
});
|
||||||
|
|
||||||
const uploadForm = new FormData();
|
const uploadForm = new FormData();
|
||||||
|
|
||||||
@@ -22,32 +27,36 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||||||
[Symbol.toStringTag]: "Blob",
|
[Symbol.toStringTag]: "Blob",
|
||||||
name: tempFilePath,
|
name: tempFilePath,
|
||||||
stream() {
|
stream() {
|
||||||
return createReadStream(tempFilePath) as unknown as ReadableStream<Uint8Array>
|
return createReadStream(
|
||||||
|
tempFilePath
|
||||||
|
) as unknown as ReadableStream<Uint8Array>;
|
||||||
},
|
},
|
||||||
arrayBuffer() {
|
arrayBuffer() {
|
||||||
throw Error("Unimplemented in mock Blob: arrayBuffer")
|
throw Error("Unimplemented in mock Blob: arrayBuffer");
|
||||||
},
|
},
|
||||||
size: (await fs.stat(tempFilePath)).size,
|
size: (await fs.stat(tempFilePath)).size,
|
||||||
text() {
|
text() {
|
||||||
throw Error("Unimplemented in mock Blob: text")
|
throw Error("Unimplemented in mock Blob: text");
|
||||||
},
|
},
|
||||||
slice(start, end, contentType) {
|
slice(start, end, contentType) {
|
||||||
throw Error("Unimplemented in mock Blob: slice")
|
throw Error("Unimplemented in mock Blob: slice");
|
||||||
},
|
},
|
||||||
type: "application/pdf",
|
type: "application/pdf"
|
||||||
} as Blob);
|
} as Blob);
|
||||||
|
|
||||||
const upload = await robustFetch({
|
const upload = await robustFetch({
|
||||||
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
"Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
|
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`
|
||||||
},
|
},
|
||||||
body: uploadForm,
|
body: uploadForm,
|
||||||
logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
|
logger: meta.logger.child({
|
||||||
schema: z.object({
|
method: "scrapePDFWithLlamaParse/upload/robustFetch"
|
||||||
id: z.string(),
|
|
||||||
}),
|
}),
|
||||||
|
schema: z.object({
|
||||||
|
id: z.string()
|
||||||
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
const jobId = upload.id;
|
const jobId = upload.id;
|
||||||
@@ -61,16 +70,18 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||||||
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
|
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
|
||||||
method: "GET",
|
method: "GET",
|
||||||
headers: {
|
headers: {
|
||||||
"Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
|
Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`
|
||||||
},
|
},
|
||||||
logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
|
logger: meta.logger.child({
|
||||||
schema: z.object({
|
method: "scrapePDFWithLlamaParse/result/robustFetch"
|
||||||
markdown: z.string(),
|
|
||||||
}),
|
}),
|
||||||
|
schema: z.object({
|
||||||
|
markdown: z.string()
|
||||||
|
})
|
||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
markdown: result.markdown,
|
markdown: result.markdown,
|
||||||
html: await marked.parse(result.markdown, { async: true }),
|
html: await marked.parse(result.markdown, { async: true })
|
||||||
};
|
};
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (e instanceof Error && e.message === "Request sent failure status") {
|
if (e instanceof Error && e.message === "Request sent failure status") {
|
||||||
@@ -82,7 +93,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||||||
throw new RemoveFeatureError(["pdf"]);
|
throw new RemoveFeatureError(["pdf"]);
|
||||||
} else {
|
} else {
|
||||||
throw new Error("LlamaParse threw an error", {
|
throw new Error("LlamaParse threw an error", {
|
||||||
cause: e.cause,
|
cause: e.cause
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -96,7 +107,10 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
|||||||
throw new Error("LlamaParse timed out");
|
throw new Error("LlamaParse timed out");
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
|
async function scrapePDFWithParsePDF(
|
||||||
|
meta: Meta,
|
||||||
|
tempFilePath: string
|
||||||
|
): Promise<PDFProcessorResult> {
|
||||||
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
|
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
|
||||||
|
|
||||||
const result = await PdfParse(await fs.readFile(tempFilePath));
|
const result = await PdfParse(await fs.readFile(tempFilePath));
|
||||||
@@ -104,7 +118,7 @@ async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
markdown: escaped,
|
markdown: escaped,
|
||||||
html: escaped,
|
html: escaped
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,7 +131,7 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
statusCode: file.response.status,
|
statusCode: file.response.status,
|
||||||
|
|
||||||
html: content,
|
html: content,
|
||||||
markdown: content,
|
markdown: content
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -126,27 +140,40 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
let result: PDFProcessorResult | null = null;
|
let result: PDFProcessorResult | null = null;
|
||||||
if (process.env.LLAMAPARSE_API_KEY) {
|
if (process.env.LLAMAPARSE_API_KEY) {
|
||||||
try {
|
try {
|
||||||
result = await scrapePDFWithLlamaParse({
|
result = await scrapePDFWithLlamaParse(
|
||||||
|
{
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
|
logger: meta.logger.child({
|
||||||
}, tempFilePath);
|
method: "scrapePDF/scrapePDFWithLlamaParse"
|
||||||
|
})
|
||||||
|
},
|
||||||
|
tempFilePath
|
||||||
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
||||||
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { error });
|
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
|
||||||
|
error
|
||||||
|
});
|
||||||
} else if (error instanceof RemoveFeatureError) {
|
} else if (error instanceof RemoveFeatureError) {
|
||||||
throw error;
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
|
meta.logger.warn(
|
||||||
|
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
|
||||||
|
{ error }
|
||||||
|
);
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result === null) {
|
if (result === null) {
|
||||||
result = await scrapePDFWithParsePDF({
|
result = await scrapePDFWithParsePDF(
|
||||||
|
{
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
|
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" })
|
||||||
}, tempFilePath);
|
},
|
||||||
|
tempFilePath
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
await fs.unlink(tempFilePath);
|
await fs.unlink(tempFilePath);
|
||||||
@@ -156,6 +183,6 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
|||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
|
||||||
html: result.html,
|
html: result.html,
|
||||||
markdown: result.markdown,
|
markdown: result.markdown
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,39 +4,44 @@ import { Meta } from "../..";
|
|||||||
import { TimeoutError } from "../../error";
|
import { TimeoutError } from "../../error";
|
||||||
import { robustFetch } from "../../lib/fetch";
|
import { robustFetch } from "../../lib/fetch";
|
||||||
|
|
||||||
export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeURLWithPlaywright(
|
||||||
|
meta: Meta
|
||||||
|
): Promise<EngineScrapeResult> {
|
||||||
const timeout = 20000 + meta.options.waitFor;
|
const timeout = 20000 + meta.options.waitFor;
|
||||||
|
|
||||||
const response = await Promise.race([
|
const response = await Promise.race([
|
||||||
await robustFetch({
|
await robustFetch({
|
||||||
url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
|
url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json"
|
||||||
},
|
},
|
||||||
body: {
|
body: {
|
||||||
url: meta.url,
|
url: meta.url,
|
||||||
wait_after_load: meta.options.waitFor,
|
wait_after_load: meta.options.waitFor,
|
||||||
timeout,
|
timeout,
|
||||||
headers: meta.options.headers,
|
headers: meta.options.headers
|
||||||
},
|
},
|
||||||
method: "POST",
|
method: "POST",
|
||||||
logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
|
logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
content: z.string(),
|
content: z.string(),
|
||||||
pageStatusCode: z.number(),
|
pageStatusCode: z.number(),
|
||||||
pageError: z.string().optional(),
|
pageError: z.string().optional()
|
||||||
}),
|
})
|
||||||
}),
|
}),
|
||||||
(async () => {
|
(async () => {
|
||||||
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
|
await new Promise((resolve) => setTimeout(() => resolve(null), 20000));
|
||||||
throw new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } });
|
throw new TimeoutError(
|
||||||
})(),
|
"Playwright was unable to scrape the page before timing out",
|
||||||
|
{ cause: { timeout } }
|
||||||
|
);
|
||||||
|
})()
|
||||||
]);
|
]);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: meta.url, // TODO: impove redirect following
|
url: meta.url, // TODO: impove redirect following
|
||||||
html: response.content,
|
html: response.content,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
error: response.pageError,
|
error: response.pageError
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,9 @@ import { EngineError } from "../../error";
|
|||||||
|
|
||||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
||||||
|
|
||||||
export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "networkidle2"): ((meta: Meta) => Promise<EngineScrapeResult>) {
|
export function scrapeURLWithScrapingBee(
|
||||||
|
wait_browser: "domcontentloaded" | "networkidle2"
|
||||||
|
): (meta: Meta) => Promise<EngineScrapeResult> {
|
||||||
return async (meta: Meta): Promise<EngineScrapeResult> => {
|
return async (meta: Meta): Promise<EngineScrapeResult> => {
|
||||||
let response: AxiosResponse<any>;
|
let response: AxiosResponse<any>;
|
||||||
try {
|
try {
|
||||||
@@ -20,11 +22,13 @@ export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "net
|
|||||||
transparent_status_code: true,
|
transparent_status_code: true,
|
||||||
json_response: true,
|
json_response: true,
|
||||||
screenshot: meta.options.formats.includes("screenshot"),
|
screenshot: meta.options.formats.includes("screenshot"),
|
||||||
screenshot_full_page: meta.options.formats.includes("screenshot@fullPage"),
|
screenshot_full_page: meta.options.formats.includes(
|
||||||
|
"screenshot@fullPage"
|
||||||
|
)
|
||||||
},
|
},
|
||||||
headers: {
|
headers: {
|
||||||
"ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
|
"ScrapingService-Request": "TRUE" // this is sent to the page, not to ScrapingBee - mogery
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof AxiosError && error.response !== undefined) {
|
if (error instanceof AxiosError && error.response !== undefined) {
|
||||||
@@ -38,19 +42,35 @@ export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "net
|
|||||||
const body = JSON.parse(new TextDecoder().decode(data));
|
const body = JSON.parse(new TextDecoder().decode(data));
|
||||||
|
|
||||||
const headers = body.headers ?? {};
|
const headers = body.headers ?? {};
|
||||||
const isHiddenEngineError = !(headers["Date"] ?? headers["date"] ?? headers["Content-Type"] ?? headers["content-type"]);
|
const isHiddenEngineError = !(
|
||||||
|
headers["Date"] ??
|
||||||
|
headers["date"] ??
|
||||||
|
headers["Content-Type"] ??
|
||||||
|
headers["content-type"]
|
||||||
|
);
|
||||||
|
|
||||||
if (body.errors || body.body?.error || isHiddenEngineError) {
|
if (body.errors || body.body?.error || isHiddenEngineError) {
|
||||||
meta.logger.error("ScrapingBee threw an error", { body: body.body?.error ?? body.errors ?? body.body ?? body });
|
meta.logger.error("ScrapingBee threw an error", {
|
||||||
throw new EngineError("Engine error #34", { cause: { body, statusCode: response.status } });
|
body: body.body?.error ?? body.errors ?? body.body ?? body
|
||||||
|
});
|
||||||
|
throw new EngineError("Engine error #34", {
|
||||||
|
cause: { body, statusCode: response.status }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (typeof body.body !== "string") {
|
if (typeof body.body !== "string") {
|
||||||
meta.logger.error("ScrapingBee: Body is not string??", { body });
|
meta.logger.error("ScrapingBee: Body is not string??", { body });
|
||||||
throw new EngineError("Engine error #35", { cause: { body, statusCode: response.status } });
|
throw new EngineError("Engine error #35", {
|
||||||
|
cause: { body, statusCode: response.status }
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" }), body.headers);
|
specialtyScrapeCheck(
|
||||||
|
meta.logger.child({
|
||||||
|
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck"
|
||||||
|
}),
|
||||||
|
body.headers
|
||||||
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: body["resolved-url"] ?? meta.url,
|
url: body["resolved-url"] ?? meta.url,
|
||||||
@@ -58,9 +78,11 @@ export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "net
|
|||||||
html: body.body,
|
html: body.body,
|
||||||
error: response.status >= 300 ? response.statusText : undefined,
|
error: response.status >= 300 ? response.statusText : undefined,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
...(body.screenshot ? ({
|
...(body.screenshot
|
||||||
screenshot: `data:image/png;base64,${body.screenshot}`,
|
? {
|
||||||
}) : {}),
|
screenshot: `data:image/png;base64,${body.screenshot}`
|
||||||
|
}
|
||||||
|
: {})
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,19 +7,22 @@ import { v4 as uuid } from "uuid";
|
|||||||
import * as undici from "undici";
|
import * as undici from "undici";
|
||||||
|
|
||||||
export async function fetchFileToBuffer(url: string): Promise<{
|
export async function fetchFileToBuffer(url: string): Promise<{
|
||||||
response: Response,
|
response: Response;
|
||||||
buffer: Buffer
|
buffer: Buffer;
|
||||||
}> {
|
}> {
|
||||||
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
||||||
return {
|
return {
|
||||||
response,
|
response,
|
||||||
buffer: Buffer.from(await response.arrayBuffer()),
|
buffer: Buffer.from(await response.arrayBuffer())
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function downloadFile(id: string, url: string): Promise<{
|
export async function downloadFile(
|
||||||
response: undici.Response
|
id: string,
|
||||||
tempFilePath: string
|
url: string
|
||||||
|
): Promise<{
|
||||||
|
response: undici.Response;
|
||||||
|
tempFilePath: string;
|
||||||
}> {
|
}> {
|
||||||
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
||||||
const tempFileWrite = createWriteStream(tempFilePath);
|
const tempFileWrite = createWriteStream(tempFilePath);
|
||||||
@@ -29,8 +32,8 @@ export async function downloadFile(id: string, url: string): Promise<{
|
|||||||
const response = await undici.fetch(url, {
|
const response = await undici.fetch(url, {
|
||||||
dispatcher: new undici.Agent({
|
dispatcher: new undici.Agent({
|
||||||
connect: {
|
connect: {
|
||||||
rejectUnauthorized: false,
|
rejectUnauthorized: false
|
||||||
},
|
}
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -43,12 +46,14 @@ export async function downloadFile(id: string, url: string): Promise<{
|
|||||||
await new Promise((resolve, reject) => {
|
await new Promise((resolve, reject) => {
|
||||||
tempFileWrite.on("finish", () => resolve(null));
|
tempFileWrite.on("finish", () => resolve(null));
|
||||||
tempFileWrite.on("error", (error) => {
|
tempFileWrite.on("error", (error) => {
|
||||||
reject(new EngineError("Failed to write to temp file", { cause: { error } }));
|
reject(
|
||||||
|
new EngineError("Failed to write to temp file", { cause: { error } })
|
||||||
|
);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
})
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
response,
|
response,
|
||||||
tempFilePath,
|
tempFilePath
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,14 +1,32 @@
|
|||||||
import { Logger } from "winston";
|
import { Logger } from "winston";
|
||||||
import { AddFeatureError } from "../../error";
|
import { AddFeatureError } from "../../error";
|
||||||
|
|
||||||
export function specialtyScrapeCheck(logger: Logger, headers: Record<string, string> | undefined) {
|
export function specialtyScrapeCheck(
|
||||||
const contentType = (Object.entries(headers ?? {}).find(x => x[0].toLowerCase() === "content-type") ?? [])[1];
|
logger: Logger,
|
||||||
|
headers: Record<string, string> | undefined
|
||||||
|
) {
|
||||||
|
const contentType = (Object.entries(headers ?? {}).find(
|
||||||
|
(x) => x[0].toLowerCase() === "content-type"
|
||||||
|
) ?? [])[1];
|
||||||
|
|
||||||
if (contentType === undefined) {
|
if (contentType === undefined) {
|
||||||
logger.warn("Failed to check contentType -- was not present in headers", { headers });
|
logger.warn("Failed to check contentType -- was not present in headers", {
|
||||||
} else if (contentType === "application/pdf" || contentType.startsWith("application/pdf;")) { // .pdf
|
headers
|
||||||
|
});
|
||||||
|
} else if (
|
||||||
|
contentType === "application/pdf" ||
|
||||||
|
contentType.startsWith("application/pdf;")
|
||||||
|
) {
|
||||||
|
// .pdf
|
||||||
throw new AddFeatureError(["pdf"]);
|
throw new AddFeatureError(["pdf"]);
|
||||||
} else if (contentType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || contentType.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document;")) { // .docx
|
} else if (
|
||||||
|
contentType ===
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
||||||
|
contentType.startsWith(
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document;"
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
// .docx
|
||||||
throw new AddFeatureError(["docx"]);
|
throw new AddFeatureError(["docx"]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
import { EngineResultsTracker } from "."
|
import { EngineResultsTracker } from ".";
|
||||||
import { Engine, FeatureFlag } from "./engines"
|
import { Engine, FeatureFlag } from "./engines";
|
||||||
|
|
||||||
export class EngineError extends Error {
|
export class EngineError extends Error {
|
||||||
constructor(message?: string, options?: ErrorOptions) {
|
constructor(message?: string, options?: ErrorOptions) {
|
||||||
super(message, options)
|
super(message, options);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export class TimeoutError extends Error {
|
export class TimeoutError extends Error {
|
||||||
constructor(message?: string, options?: ErrorOptions) {
|
constructor(message?: string, options?: ErrorOptions) {
|
||||||
super(message, options)
|
super(message, options);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -18,7 +18,9 @@ export class NoEnginesLeftError extends Error {
|
|||||||
public results: EngineResultsTracker;
|
public results: EngineResultsTracker;
|
||||||
|
|
||||||
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
|
constructor(fallbackList: Engine[], results: EngineResultsTracker) {
|
||||||
super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.");
|
super(
|
||||||
|
"All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com."
|
||||||
|
);
|
||||||
this.fallbackList = fallbackList;
|
this.fallbackList = fallbackList;
|
||||||
this.results = results;
|
this.results = results;
|
||||||
}
|
}
|
||||||
@@ -37,7 +39,9 @@ export class RemoveFeatureError extends Error {
|
|||||||
public featureFlags: FeatureFlag[];
|
public featureFlags: FeatureFlag[];
|
||||||
|
|
||||||
constructor(featureFlags: FeatureFlag[]) {
|
constructor(featureFlags: FeatureFlag[]) {
|
||||||
super("Incorrect feature flags have been discovered: " + featureFlags.join(", "));
|
super(
|
||||||
|
"Incorrect feature flags have been discovered: " + featureFlags.join(", ")
|
||||||
|
);
|
||||||
this.featureFlags = featureFlags;
|
this.featureFlags = featureFlags;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -45,7 +49,9 @@ export class RemoveFeatureError extends Error {
|
|||||||
export class SiteError extends Error {
|
export class SiteError extends Error {
|
||||||
public code: string;
|
public code: string;
|
||||||
constructor(code: string) {
|
constructor(code: string) {
|
||||||
super("Specified URL is failing to load in the browser. Error code: " + code)
|
super(
|
||||||
|
"Specified URL is failing to load in the browser. Error code: " + code
|
||||||
|
);
|
||||||
this.code = code;
|
this.code = code;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,23 +3,39 @@ import * as Sentry from "@sentry/node";
|
|||||||
|
|
||||||
import { Document, ScrapeOptions } from "../../controllers/v1/types";
|
import { Document, ScrapeOptions } from "../../controllers/v1/types";
|
||||||
import { logger } from "../../lib/logger";
|
import { logger } from "../../lib/logger";
|
||||||
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
|
import {
|
||||||
|
buildFallbackList,
|
||||||
|
Engine,
|
||||||
|
EngineScrapeResult,
|
||||||
|
FeatureFlag,
|
||||||
|
scrapeURLWithEngine
|
||||||
|
} from "./engines";
|
||||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||||
import { AddFeatureError, EngineError, NoEnginesLeftError, RemoveFeatureError, SiteError, TimeoutError } from "./error";
|
import {
|
||||||
|
AddFeatureError,
|
||||||
|
EngineError,
|
||||||
|
NoEnginesLeftError,
|
||||||
|
RemoveFeatureError,
|
||||||
|
SiteError,
|
||||||
|
TimeoutError
|
||||||
|
} from "./error";
|
||||||
import { executeTransformers } from "./transformers";
|
import { executeTransformers } from "./transformers";
|
||||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||||
import { urlSpecificParams } from "./lib/urlSpecificParams";
|
import { urlSpecificParams } from "./lib/urlSpecificParams";
|
||||||
|
|
||||||
export type ScrapeUrlResponse = ({
|
export type ScrapeUrlResponse = (
|
||||||
success: true,
|
| {
|
||||||
document: Document,
|
success: true;
|
||||||
} | {
|
document: Document;
|
||||||
success: false,
|
}
|
||||||
error: any,
|
| {
|
||||||
}) & {
|
success: false;
|
||||||
logs: any[],
|
error: any;
|
||||||
engines: EngineResultsTracker,
|
}
|
||||||
}
|
) & {
|
||||||
|
logs: any[];
|
||||||
|
engines: EngineResultsTracker;
|
||||||
|
};
|
||||||
|
|
||||||
export type Meta = {
|
export type Meta = {
|
||||||
id: string;
|
id: string;
|
||||||
@@ -29,9 +45,13 @@ export type Meta = {
|
|||||||
logger: Logger;
|
logger: Logger;
|
||||||
logs: any[];
|
logs: any[];
|
||||||
featureFlags: Set<FeatureFlag>;
|
featureFlags: Set<FeatureFlag>;
|
||||||
}
|
};
|
||||||
|
|
||||||
function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions: InternalOptions): Set<FeatureFlag> {
|
function buildFeatureFlags(
|
||||||
|
url: string,
|
||||||
|
options: ScrapeOptions,
|
||||||
|
internalOptions: InternalOptions
|
||||||
|
): Set<FeatureFlag> {
|
||||||
const flags: Set<FeatureFlag> = new Set();
|
const flags: Set<FeatureFlag> = new Set();
|
||||||
|
|
||||||
if (options.actions !== undefined) {
|
if (options.actions !== undefined) {
|
||||||
@@ -88,21 +108,37 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions:
|
|||||||
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
|
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
|
||||||
// Having a meta object that is treated as immutable helps the code stay clean and easily tracable,
|
// Having a meta object that is treated as immutable helps the code stay clean and easily tracable,
|
||||||
// while also retaining the benefits that WebScraper had from its OOP design.
|
// while also retaining the benefits that WebScraper had from its OOP design.
|
||||||
function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta {
|
function buildMetaObject(
|
||||||
const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
id: string,
|
||||||
|
url: string,
|
||||||
|
options: ScrapeOptions,
|
||||||
|
internalOptions: InternalOptions
|
||||||
|
): Meta {
|
||||||
|
const specParams =
|
||||||
|
urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
|
||||||
if (specParams !== undefined) {
|
if (specParams !== undefined) {
|
||||||
options = Object.assign(options, specParams.scrapeOptions);
|
options = Object.assign(options, specParams.scrapeOptions);
|
||||||
internalOptions = Object.assign(internalOptions, specParams.internalOptions);
|
internalOptions = Object.assign(
|
||||||
|
internalOptions,
|
||||||
|
specParams.internalOptions
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id, scrapeURL: url });
|
const _logger = logger.child({
|
||||||
|
module: "ScrapeURL",
|
||||||
|
scrapeId: id,
|
||||||
|
scrapeURL: url
|
||||||
|
});
|
||||||
const logs: any[] = [];
|
const logs: any[] = [];
|
||||||
|
|
||||||
return {
|
return {
|
||||||
id, url, options, internalOptions,
|
id,
|
||||||
|
url,
|
||||||
|
options,
|
||||||
|
internalOptions,
|
||||||
logger: _logger,
|
logger: _logger,
|
||||||
logs,
|
logs,
|
||||||
featureFlags: buildFeatureFlags(url, options, internalOptions),
|
featureFlags: buildFeatureFlags(url, options, internalOptions)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -118,26 +154,32 @@ export type InternalOptions = {
|
|||||||
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
disableSmartWaitCache?: boolean; // Passed along to fire-engine
|
||||||
};
|
};
|
||||||
|
|
||||||
export type EngineResultsTracker = { [E in Engine]?: ({
|
export type EngineResultsTracker = {
|
||||||
state: "error",
|
[E in Engine]?: (
|
||||||
error: any,
|
| {
|
||||||
unexpected: boolean,
|
state: "error";
|
||||||
} | {
|
error: any;
|
||||||
state: "success",
|
unexpected: boolean;
|
||||||
result: EngineScrapeResult & { markdown: string },
|
}
|
||||||
factors: Record<string, boolean>,
|
| {
|
||||||
unsupportedFeatures: Set<FeatureFlag>,
|
state: "success";
|
||||||
} | {
|
result: EngineScrapeResult & { markdown: string };
|
||||||
state: "timeout",
|
factors: Record<string, boolean>;
|
||||||
}) & {
|
unsupportedFeatures: Set<FeatureFlag>;
|
||||||
startedAt: number,
|
}
|
||||||
finishedAt: number,
|
| {
|
||||||
} };
|
state: "timeout";
|
||||||
|
}
|
||||||
|
) & {
|
||||||
|
startedAt: number;
|
||||||
|
finishedAt: number;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
export type EngineScrapeResultWithContext = {
|
export type EngineScrapeResultWithContext = {
|
||||||
engine: Engine,
|
engine: Engine;
|
||||||
unsupportedFeatures: Set<FeatureFlag>,
|
unsupportedFeatures: Set<FeatureFlag>;
|
||||||
result: (EngineScrapeResult & { markdown: string }),
|
result: EngineScrapeResult & { markdown: string };
|
||||||
};
|
};
|
||||||
|
|
||||||
function safeguardCircularError<T>(error: T): T {
|
function safeguardCircularError<T>(error: T): T {
|
||||||
@@ -150,10 +192,8 @@ function safeguardCircularError<T>(error: T): T {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeURLLoop(
|
async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||||
meta: Meta
|
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`);
|
||||||
): Promise<ScrapeUrlResponse> {
|
|
||||||
meta.logger.info(`Scraping URL ${JSON.stringify(meta.url)}...`,);
|
|
||||||
|
|
||||||
// TODO: handle sitemap data, see WebScraper/index.ts:280
|
// TODO: handle sitemap data, see WebScraper/index.ts:280
|
||||||
// TODO: ScrapeEvents
|
// TODO: ScrapeEvents
|
||||||
@@ -168,14 +208,19 @@ async function scrapeURLLoop(
|
|||||||
try {
|
try {
|
||||||
meta.logger.info("Scraping via " + engine + "...");
|
meta.logger.info("Scraping via " + engine + "...");
|
||||||
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
const _engineResult = await scrapeURLWithEngine(meta, engine);
|
||||||
if (_engineResult.markdown === undefined) { // Some engines emit Markdown directly.
|
if (_engineResult.markdown === undefined) {
|
||||||
|
// Some engines emit Markdown directly.
|
||||||
_engineResult.markdown = await parseMarkdown(_engineResult.html);
|
_engineResult.markdown = await parseMarkdown(_engineResult.html);
|
||||||
}
|
}
|
||||||
const engineResult = _engineResult as EngineScrapeResult & { markdown: string };
|
const engineResult = _engineResult as EngineScrapeResult & {
|
||||||
|
markdown: string;
|
||||||
|
};
|
||||||
|
|
||||||
// Success factors
|
// Success factors
|
||||||
const isLongEnough = engineResult.markdown.length >= 20;
|
const isLongEnough = engineResult.markdown.length >= 20;
|
||||||
const isGoodStatusCode = (engineResult.statusCode >= 200 && engineResult.statusCode < 300) || engineResult.statusCode === 304;
|
const isGoodStatusCode =
|
||||||
|
(engineResult.statusCode >= 200 && engineResult.statusCode < 300) ||
|
||||||
|
engineResult.statusCode === 304;
|
||||||
const hasNoPageError = engineResult.error === undefined;
|
const hasNoPageError = engineResult.error === undefined;
|
||||||
|
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
@@ -184,14 +229,16 @@ async function scrapeURLLoop(
|
|||||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||||
unsupportedFeatures,
|
unsupportedFeatures,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now(),
|
finishedAt: Date.now()
|
||||||
};
|
};
|
||||||
|
|
||||||
// NOTE: TODO: what to do when status code is bad is tough...
|
// NOTE: TODO: what to do when status code is bad is tough...
|
||||||
// we cannot just rely on text because error messages can be brief and not hit the limit
|
// we cannot just rely on text because error messages can be brief and not hit the limit
|
||||||
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
||||||
if (isLongEnough || !isGoodStatusCode) {
|
if (isLongEnough || !isGoodStatusCode) {
|
||||||
meta.logger.info("Scrape via " + engine + " deemed successful.", { factors: { isLongEnough, isGoodStatusCode, hasNoPageError } });
|
meta.logger.info("Scrape via " + engine + " deemed successful.", {
|
||||||
|
factors: { isLongEnough, isGoodStatusCode, hasNoPageError }
|
||||||
|
});
|
||||||
result = {
|
result = {
|
||||||
engine,
|
engine,
|
||||||
unsupportedFeatures,
|
unsupportedFeatures,
|
||||||
@@ -201,22 +248,29 @@ async function scrapeURLLoop(
|
|||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof EngineError) {
|
if (error instanceof EngineError) {
|
||||||
meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
|
meta.logger.info("Engine " + engine + " could not scrape the page.", {
|
||||||
|
error
|
||||||
|
});
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "error",
|
state: "error",
|
||||||
error: safeguardCircularError(error),
|
error: safeguardCircularError(error),
|
||||||
unexpected: false,
|
unexpected: false,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now(),
|
finishedAt: Date.now()
|
||||||
};
|
};
|
||||||
} else if (error instanceof TimeoutError) {
|
} else if (error instanceof TimeoutError) {
|
||||||
meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
|
meta.logger.info("Engine " + engine + " timed out while scraping.", {
|
||||||
|
error
|
||||||
|
});
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "timeout",
|
state: "timeout",
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now(),
|
finishedAt: Date.now()
|
||||||
};
|
};
|
||||||
} else if (error instanceof AddFeatureError || error instanceof RemoveFeatureError) {
|
} else if (
|
||||||
|
error instanceof AddFeatureError ||
|
||||||
|
error instanceof RemoveFeatureError
|
||||||
|
) {
|
||||||
throw error;
|
throw error;
|
||||||
} else if (error instanceof LLMRefusalError) {
|
} else if (error instanceof LLMRefusalError) {
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
@@ -224,8 +278,8 @@ async function scrapeURLLoop(
|
|||||||
error: safeguardCircularError(error),
|
error: safeguardCircularError(error),
|
||||||
unexpected: true,
|
unexpected: true,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now(),
|
finishedAt: Date.now()
|
||||||
}
|
};
|
||||||
error.results = results;
|
error.results = results;
|
||||||
meta.logger.warn("LLM refusal encountered", { error });
|
meta.logger.warn("LLM refusal encountered", { error });
|
||||||
throw error;
|
throw error;
|
||||||
@@ -233,20 +287,26 @@ async function scrapeURLLoop(
|
|||||||
throw error;
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
|
meta.logger.info(
|
||||||
|
"An unexpected error happened while scraping with " + engine + ".",
|
||||||
|
{ error }
|
||||||
|
);
|
||||||
results[engine] = {
|
results[engine] = {
|
||||||
state: "error",
|
state: "error",
|
||||||
error: safeguardCircularError(error),
|
error: safeguardCircularError(error),
|
||||||
unexpected: true,
|
unexpected: true,
|
||||||
startedAt,
|
startedAt,
|
||||||
finishedAt: Date.now(),
|
finishedAt: Date.now()
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result === null) {
|
if (result === null) {
|
||||||
throw new NoEnginesLeftError(fallbackList.map(x => x.engine), results);
|
throw new NoEnginesLeftError(
|
||||||
|
fallbackList.map((x) => x.engine),
|
||||||
|
results
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let document: Document = {
|
let document: Document = {
|
||||||
@@ -258,14 +318,20 @@ async function scrapeURLLoop(
|
|||||||
sourceURL: meta.url,
|
sourceURL: meta.url,
|
||||||
url: result.result.url,
|
url: result.result.url,
|
||||||
statusCode: result.result.statusCode,
|
statusCode: result.result.statusCode,
|
||||||
error: result.result.error,
|
error: result.result.error
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
if (result.unsupportedFeatures.size > 0) {
|
if (result.unsupportedFeatures.size > 0) {
|
||||||
const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
|
const warning = `The engine used does not support the following features: ${[...result.unsupportedFeatures].join(", ")} -- your scrape may be partial.`;
|
||||||
meta.logger.warn(warning, { engine: result.engine, unsupportedFeatures: result.unsupportedFeatures });
|
meta.logger.warn(warning, {
|
||||||
document.warning = document.warning !== undefined ? document.warning + " " + warning : warning;
|
engine: result.engine,
|
||||||
|
unsupportedFeatures: result.unsupportedFeatures
|
||||||
|
});
|
||||||
|
document.warning =
|
||||||
|
document.warning !== undefined
|
||||||
|
? document.warning + " " + warning
|
||||||
|
: warning;
|
||||||
}
|
}
|
||||||
|
|
||||||
document = await executeTransformers(meta, document);
|
document = await executeTransformers(meta, document);
|
||||||
@@ -274,7 +340,7 @@ async function scrapeURLLoop(
|
|||||||
success: true,
|
success: true,
|
||||||
document,
|
document,
|
||||||
logs: meta.logs,
|
logs: meta.logs,
|
||||||
engines: results,
|
engines: results
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -282,7 +348,7 @@ export async function scrapeURL(
|
|||||||
id: string,
|
id: string,
|
||||||
url: string,
|
url: string,
|
||||||
options: ScrapeOptions,
|
options: ScrapeOptions,
|
||||||
internalOptions: InternalOptions = {},
|
internalOptions: InternalOptions = {}
|
||||||
): Promise<ScrapeUrlResponse> {
|
): Promise<ScrapeUrlResponse> {
|
||||||
const meta = buildMetaObject(id, url, options, internalOptions);
|
const meta = buildMetaObject(id, url, options, internalOptions);
|
||||||
try {
|
try {
|
||||||
@@ -290,12 +356,32 @@ export async function scrapeURL(
|
|||||||
try {
|
try {
|
||||||
return await scrapeURLLoop(meta);
|
return await scrapeURLLoop(meta);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) {
|
if (
|
||||||
meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags });
|
error instanceof AddFeatureError &&
|
||||||
meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags));
|
meta.internalOptions.forceEngine === undefined
|
||||||
} else if (error instanceof RemoveFeatureError && meta.internalOptions.forceEngine === undefined) {
|
) {
|
||||||
meta.logger.debug("Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), { error, existingFlags: meta.featureFlags });
|
meta.logger.debug(
|
||||||
meta.featureFlags = new Set([...meta.featureFlags].filter(x => !error.featureFlags.includes(x)));
|
"More feature flags requested by scraper: adding " +
|
||||||
|
error.featureFlags.join(", "),
|
||||||
|
{ error, existingFlags: meta.featureFlags }
|
||||||
|
);
|
||||||
|
meta.featureFlags = new Set(
|
||||||
|
[...meta.featureFlags].concat(error.featureFlags)
|
||||||
|
);
|
||||||
|
} else if (
|
||||||
|
error instanceof RemoveFeatureError &&
|
||||||
|
meta.internalOptions.forceEngine === undefined
|
||||||
|
) {
|
||||||
|
meta.logger.debug(
|
||||||
|
"Incorrect feature flags reported by scraper: removing " +
|
||||||
|
error.featureFlags.join(","),
|
||||||
|
{ error, existingFlags: meta.featureFlags }
|
||||||
|
);
|
||||||
|
meta.featureFlags = new Set(
|
||||||
|
[...meta.featureFlags].filter(
|
||||||
|
(x) => !error.featureFlags.includes(x)
|
||||||
|
)
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
@@ -310,7 +396,11 @@ export async function scrapeURL(
|
|||||||
} else if (error instanceof LLMRefusalError) {
|
} else if (error instanceof LLMRefusalError) {
|
||||||
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
|
meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
|
||||||
results = error.results!;
|
results = error.results!;
|
||||||
} else if (error instanceof Error && error.message.includes("Invalid schema for response_format")) { // TODO: seperate into custom error
|
} else if (
|
||||||
|
error instanceof Error &&
|
||||||
|
error.message.includes("Invalid schema for response_format")
|
||||||
|
) {
|
||||||
|
// TODO: seperate into custom error
|
||||||
meta.logger.warn("scrapeURL: LLM schema error", { error });
|
meta.logger.warn("scrapeURL: LLM schema error", { error });
|
||||||
// TODO: results?
|
// TODO: results?
|
||||||
} else if (error instanceof SiteError) {
|
} else if (error instanceof SiteError) {
|
||||||
@@ -325,7 +415,7 @@ export async function scrapeURL(
|
|||||||
success: false,
|
success: false,
|
||||||
error,
|
error,
|
||||||
logs: meta.logs,
|
logs: meta.logs,
|
||||||
engines: results,
|
engines: results
|
||||||
}
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,26 +6,29 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
|||||||
const $ = load(html);
|
const $ = load(html);
|
||||||
const links: string[] = [];
|
const links: string[] = [];
|
||||||
|
|
||||||
$('a').each((_, element) => {
|
$("a").each((_, element) => {
|
||||||
const href = $(element).attr('href');
|
const href = $(element).attr("href");
|
||||||
if (href) {
|
if (href) {
|
||||||
try {
|
try {
|
||||||
if (href.startsWith('http://') || href.startsWith('https://')) {
|
if (href.startsWith("http://") || href.startsWith("https://")) {
|
||||||
// Absolute URL, add as is
|
// Absolute URL, add as is
|
||||||
links.push(href);
|
links.push(href);
|
||||||
} else if (href.startsWith('/')) {
|
} else if (href.startsWith("/")) {
|
||||||
// Relative URL starting with '/', append to origin
|
// Relative URL starting with '/', append to origin
|
||||||
links.push(new URL(href, baseUrl).href);
|
links.push(new URL(href, baseUrl).href);
|
||||||
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
|
} else if (!href.startsWith("#") && !href.startsWith("mailto:")) {
|
||||||
// Relative URL not starting with '/', append to base URL
|
// Relative URL not starting with '/', append to base URL
|
||||||
links.push(new URL(href, baseUrl).href);
|
links.push(new URL(href, baseUrl).href);
|
||||||
} else if (href.startsWith('mailto:')) {
|
} else if (href.startsWith("mailto:")) {
|
||||||
// mailto: links, add as is
|
// mailto: links, add as is
|
||||||
links.push(href);
|
links.push(href);
|
||||||
}
|
}
|
||||||
// Fragment-only links (#) are ignored
|
// Fragment-only links (#) are ignored
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, { error });
|
logger.error(
|
||||||
|
`Failed to construct URL for href: ${href} with base: ${baseUrl}`,
|
||||||
|
{ error }
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -2,7 +2,10 @@ import { load } from "cheerio";
|
|||||||
import { Document } from "../../../controllers/v1/types";
|
import { Document } from "../../../controllers/v1/types";
|
||||||
import { Meta } from "..";
|
import { Meta } from "..";
|
||||||
|
|
||||||
export function extractMetadata(meta: Meta, html: string): Document["metadata"] {
|
export function extractMetadata(
|
||||||
|
meta: Meta,
|
||||||
|
html: string
|
||||||
|
): Document["metadata"] {
|
||||||
let title: string | undefined = undefined;
|
let title: string | undefined = undefined;
|
||||||
let description: string | undefined = undefined;
|
let description: string | undefined = undefined;
|
||||||
let language: string | undefined = undefined;
|
let language: string | undefined = undefined;
|
||||||
@@ -41,34 +44,52 @@ export function extractMetadata(meta: Meta, html: string): Document["metadata"]
|
|||||||
description = soup('meta[name="description"]').attr("content") || undefined;
|
description = soup('meta[name="description"]').attr("content") || undefined;
|
||||||
|
|
||||||
// Assuming the language is part of the URL as per the regex pattern
|
// Assuming the language is part of the URL as per the regex pattern
|
||||||
language = soup('html').attr('lang') || undefined;
|
language = soup("html").attr("lang") || undefined;
|
||||||
|
|
||||||
keywords = soup('meta[name="keywords"]').attr("content") || undefined;
|
keywords = soup('meta[name="keywords"]').attr("content") || undefined;
|
||||||
robots = soup('meta[name="robots"]').attr("content") || undefined;
|
robots = soup('meta[name="robots"]').attr("content") || undefined;
|
||||||
ogTitle = soup('meta[property="og:title"]').attr("content") || undefined;
|
ogTitle = soup('meta[property="og:title"]').attr("content") || undefined;
|
||||||
ogDescription = soup('meta[property="og:description"]').attr("content") || undefined;
|
ogDescription =
|
||||||
|
soup('meta[property="og:description"]').attr("content") || undefined;
|
||||||
ogUrl = soup('meta[property="og:url"]').attr("content") || undefined;
|
ogUrl = soup('meta[property="og:url"]').attr("content") || undefined;
|
||||||
ogImage = soup('meta[property="og:image"]').attr("content") || undefined;
|
ogImage = soup('meta[property="og:image"]').attr("content") || undefined;
|
||||||
ogAudio = soup('meta[property="og:audio"]').attr("content") || undefined;
|
ogAudio = soup('meta[property="og:audio"]').attr("content") || undefined;
|
||||||
ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || undefined;
|
ogDeterminer =
|
||||||
|
soup('meta[property="og:determiner"]').attr("content") || undefined;
|
||||||
ogLocale = soup('meta[property="og:locale"]').attr("content") || undefined;
|
ogLocale = soup('meta[property="og:locale"]').attr("content") || undefined;
|
||||||
ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || undefined;
|
ogLocaleAlternate =
|
||||||
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || undefined;
|
soup('meta[property="og:locale:alternate"]')
|
||||||
|
.map((i, el) => soup(el).attr("content"))
|
||||||
|
.get() || undefined;
|
||||||
|
ogSiteName =
|
||||||
|
soup('meta[property="og:site_name"]').attr("content") || undefined;
|
||||||
ogVideo = soup('meta[property="og:video"]').attr("content") || undefined;
|
ogVideo = soup('meta[property="og:video"]').attr("content") || undefined;
|
||||||
articleSection = soup('meta[name="article:section"]').attr("content") || undefined;
|
articleSection =
|
||||||
|
soup('meta[name="article:section"]').attr("content") || undefined;
|
||||||
articleTag = soup('meta[name="article:tag"]').attr("content") || undefined;
|
articleTag = soup('meta[name="article:tag"]').attr("content") || undefined;
|
||||||
publishedTime = soup('meta[property="article:published_time"]').attr("content") || undefined;
|
publishedTime =
|
||||||
modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || undefined;
|
soup('meta[property="article:published_time"]').attr("content") ||
|
||||||
dcTermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || undefined;
|
undefined;
|
||||||
dcDescription = soup('meta[name="dc.description"]').attr("content") || undefined;
|
modifiedTime =
|
||||||
|
soup('meta[property="article:modified_time"]').attr("content") ||
|
||||||
|
undefined;
|
||||||
|
dcTermsKeywords =
|
||||||
|
soup('meta[name="dcterms.keywords"]').attr("content") || undefined;
|
||||||
|
dcDescription =
|
||||||
|
soup('meta[name="dc.description"]').attr("content") || undefined;
|
||||||
dcSubject = soup('meta[name="dc.subject"]').attr("content") || undefined;
|
dcSubject = soup('meta[name="dc.subject"]').attr("content") || undefined;
|
||||||
dcTermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || undefined;
|
dcTermsSubject =
|
||||||
dcTermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || undefined;
|
soup('meta[name="dcterms.subject"]').attr("content") || undefined;
|
||||||
|
dcTermsAudience =
|
||||||
|
soup('meta[name="dcterms.audience"]').attr("content") || undefined;
|
||||||
dcType = soup('meta[name="dc.type"]').attr("content") || undefined;
|
dcType = soup('meta[name="dc.type"]').attr("content") || undefined;
|
||||||
dcTermsType = soup('meta[name="dcterms.type"]').attr("content") || undefined;
|
dcTermsType =
|
||||||
|
soup('meta[name="dcterms.type"]').attr("content") || undefined;
|
||||||
dcDate = soup('meta[name="dc.date"]').attr("content") || undefined;
|
dcDate = soup('meta[name="dc.date"]').attr("content") || undefined;
|
||||||
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || undefined;
|
dcDateCreated =
|
||||||
dcTermsCreated = soup('meta[name="dcterms.created"]').attr("content") || undefined;
|
soup('meta[name="dc.date.created"]').attr("content") || undefined;
|
||||||
|
dcTermsCreated =
|
||||||
|
soup('meta[name="dcterms.created"]').attr("content") || undefined;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Extract all meta tags for custom metadata
|
// Extract all meta tags for custom metadata
|
||||||
@@ -127,6 +148,6 @@ export function extractMetadata(meta: Meta, html: string): Document["metadata"]
|
|||||||
publishedTime,
|
publishedTime,
|
||||||
articleTag,
|
articleTag,
|
||||||
articleSection,
|
articleSection,
|
||||||
...customMetadata,
|
...customMetadata
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import * as Sentry from "@sentry/node";
|
|||||||
|
|
||||||
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
||||||
url: string;
|
url: string;
|
||||||
logger: Logger,
|
logger: Logger;
|
||||||
method: "GET" | "POST" | "DELETE" | "PUT";
|
method: "GET" | "POST" | "DELETE" | "PUT";
|
||||||
body?: any;
|
body?: any;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
@@ -18,7 +18,10 @@ export type RobustFetchParams<Schema extends z.Schema<any>> = {
|
|||||||
tryCooldown?: number;
|
tryCooldown?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer<Schema>>({
|
export async function robustFetch<
|
||||||
|
Schema extends z.Schema<any>,
|
||||||
|
Output = z.infer<Schema>
|
||||||
|
>({
|
||||||
url,
|
url,
|
||||||
logger,
|
logger,
|
||||||
method = "GET",
|
method = "GET",
|
||||||
@@ -29,9 +32,20 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
ignoreFailure = false,
|
ignoreFailure = false,
|
||||||
requestId = uuid(),
|
requestId = uuid(),
|
||||||
tryCount = 1,
|
tryCount = 1,
|
||||||
tryCooldown,
|
tryCooldown
|
||||||
}: RobustFetchParams<Schema>): Promise<Output> {
|
}: RobustFetchParams<Schema>): Promise<Output> {
|
||||||
const params = { url, logger, method, body, headers, schema, ignoreResponse, ignoreFailure, tryCount, tryCooldown };
|
const params = {
|
||||||
|
url,
|
||||||
|
logger,
|
||||||
|
method,
|
||||||
|
body,
|
||||||
|
headers,
|
||||||
|
schema,
|
||||||
|
ignoreResponse,
|
||||||
|
ignoreFailure,
|
||||||
|
tryCount,
|
||||||
|
tryCooldown
|
||||||
|
};
|
||||||
|
|
||||||
let request: Response;
|
let request: Response;
|
||||||
try {
|
try {
|
||||||
@@ -39,37 +53,47 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
method,
|
method,
|
||||||
headers: {
|
headers: {
|
||||||
...(body instanceof FormData
|
...(body instanceof FormData
|
||||||
? ({})
|
? {}
|
||||||
: body !== undefined ? ({
|
: body !== undefined
|
||||||
"Content-Type": "application/json",
|
? {
|
||||||
}) : {}),
|
"Content-Type": "application/json"
|
||||||
...(headers !== undefined ? headers : {}),
|
}
|
||||||
|
: {}),
|
||||||
|
...(headers !== undefined ? headers : {})
|
||||||
},
|
},
|
||||||
...(body instanceof FormData ? ({
|
...(body instanceof FormData
|
||||||
body,
|
? {
|
||||||
}) : body !== undefined ? ({
|
body
|
||||||
body: JSON.stringify(body),
|
}
|
||||||
}) : {}),
|
: body !== undefined
|
||||||
|
? {
|
||||||
|
body: JSON.stringify(body)
|
||||||
|
}
|
||||||
|
: {})
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (!ignoreFailure) {
|
if (!ignoreFailure) {
|
||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
if (tryCount > 1) {
|
if (tryCount > 1) {
|
||||||
logger.debug("Request failed, trying " + (tryCount - 1) + " more times", { params, error, requestId });
|
logger.debug(
|
||||||
|
"Request failed, trying " + (tryCount - 1) + " more times",
|
||||||
|
{ params, error, requestId }
|
||||||
|
);
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
...params,
|
...params,
|
||||||
requestId,
|
requestId,
|
||||||
tryCount: tryCount - 1,
|
tryCount: tryCount - 1
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Request failed", { params, error, requestId });
|
logger.debug("Request failed", { params, error, requestId });
|
||||||
throw new Error("Request failed", {
|
throw new Error("Request failed", {
|
||||||
cause: {
|
cause: {
|
||||||
params, requestId, error,
|
params,
|
||||||
},
|
requestId,
|
||||||
|
error
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
return null as Output;
|
return null as Output;
|
||||||
}
|
}
|
||||||
@@ -82,26 +106,39 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
const response = {
|
const response = {
|
||||||
status: request.status,
|
status: request.status,
|
||||||
headers: request.headers,
|
headers: request.headers,
|
||||||
body: await request.text(), // NOTE: can this throw an exception?
|
body: await request.text() // NOTE: can this throw an exception?
|
||||||
};
|
};
|
||||||
|
|
||||||
if (request.status >= 300) {
|
if (request.status >= 300) {
|
||||||
if (tryCount > 1) {
|
if (tryCount > 1) {
|
||||||
logger.debug("Request sent failure status, trying " + (tryCount - 1) + " more times", { params, request, response, requestId });
|
logger.debug(
|
||||||
|
"Request sent failure status, trying " + (tryCount - 1) + " more times",
|
||||||
|
{ params, request, response, requestId }
|
||||||
|
);
|
||||||
if (tryCooldown !== undefined) {
|
if (tryCooldown !== undefined) {
|
||||||
await new Promise((resolve) => setTimeout(() => resolve(null), tryCooldown));
|
await new Promise((resolve) =>
|
||||||
|
setTimeout(() => resolve(null), tryCooldown)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
return await robustFetch({
|
return await robustFetch({
|
||||||
...params,
|
...params,
|
||||||
requestId,
|
requestId,
|
||||||
tryCount: tryCount - 1,
|
tryCount: tryCount - 1
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Request sent failure status", { params, request, response, requestId });
|
logger.debug("Request sent failure status", {
|
||||||
|
params,
|
||||||
|
request,
|
||||||
|
response,
|
||||||
|
requestId
|
||||||
|
});
|
||||||
throw new Error("Request sent failure status", {
|
throw new Error("Request sent failure status", {
|
||||||
cause: {
|
cause: {
|
||||||
params, request, response, requestId,
|
params,
|
||||||
},
|
request,
|
||||||
|
response,
|
||||||
|
requestId
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -110,11 +147,19 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
try {
|
try {
|
||||||
data = JSON.parse(response.body);
|
data = JSON.parse(response.body);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.debug("Request sent malformed JSON", { params, request, response, requestId });
|
logger.debug("Request sent malformed JSON", {
|
||||||
|
params,
|
||||||
|
request,
|
||||||
|
response,
|
||||||
|
requestId
|
||||||
|
});
|
||||||
throw new Error("Request sent malformed JSON", {
|
throw new Error("Request sent malformed JSON", {
|
||||||
cause: {
|
cause: {
|
||||||
params, request, response, requestId,
|
params,
|
||||||
},
|
request,
|
||||||
|
response,
|
||||||
|
requestId
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -123,19 +168,41 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
|
|||||||
data = schema.parse(data);
|
data = schema.parse(data);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof ZodError) {
|
if (error instanceof ZodError) {
|
||||||
logger.debug("Response does not match provided schema", { params, request, response, requestId, error, schema });
|
logger.debug("Response does not match provided schema", {
|
||||||
|
params,
|
||||||
|
request,
|
||||||
|
response,
|
||||||
|
requestId,
|
||||||
|
error,
|
||||||
|
schema
|
||||||
|
});
|
||||||
throw new Error("Response does not match provided schema", {
|
throw new Error("Response does not match provided schema", {
|
||||||
cause: {
|
cause: {
|
||||||
params, request, response, requestId,
|
params,
|
||||||
error, schema,
|
request,
|
||||||
|
response,
|
||||||
|
requestId,
|
||||||
|
error,
|
||||||
|
schema
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
logger.debug("Parsing response with provided schema failed", { params, request, response, requestId, error, schema });
|
logger.debug("Parsing response with provided schema failed", {
|
||||||
|
params,
|
||||||
|
request,
|
||||||
|
response,
|
||||||
|
requestId,
|
||||||
|
error,
|
||||||
|
schema
|
||||||
|
});
|
||||||
throw new Error("Parsing response with provided schema failed", {
|
throw new Error("Parsing response with provided schema failed", {
|
||||||
cause: {
|
cause: {
|
||||||
params, request, response, requestId,
|
params,
|
||||||
error, schema
|
request,
|
||||||
|
response,
|
||||||
|
requestId,
|
||||||
|
error,
|
||||||
|
schema
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,9 +50,7 @@ const excludeNonMainTags = [
|
|||||||
"#cookie"
|
"#cookie"
|
||||||
];
|
];
|
||||||
|
|
||||||
const forceIncludeMainTags = [
|
const forceIncludeMainTags = ["#main"];
|
||||||
"#main"
|
|
||||||
];
|
|
||||||
|
|
||||||
export const removeUnwantedElements = (
|
export const removeUnwantedElements = (
|
||||||
html: string,
|
html: string,
|
||||||
@@ -60,7 +58,10 @@ export const removeUnwantedElements = (
|
|||||||
) => {
|
) => {
|
||||||
const soup = load(html);
|
const soup = load(html);
|
||||||
|
|
||||||
if (scrapeOptions.includeTags && scrapeOptions.includeTags.filter(x => x.trim().length !== 0).length > 0) {
|
if (
|
||||||
|
scrapeOptions.includeTags &&
|
||||||
|
scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
|
||||||
|
) {
|
||||||
// Create a new root element to hold the tags to keep
|
// Create a new root element to hold the tags to keep
|
||||||
const newRoot = load("<div></div>")("div");
|
const newRoot = load("<div></div>")("div");
|
||||||
scrapeOptions.includeTags.forEach((tag) => {
|
scrapeOptions.includeTags.forEach((tag) => {
|
||||||
@@ -73,7 +74,10 @@ export const removeUnwantedElements = (
|
|||||||
|
|
||||||
soup("script, style, noscript, meta, head").remove();
|
soup("script, style, noscript, meta, head").remove();
|
||||||
|
|
||||||
if (scrapeOptions.excludeTags && scrapeOptions.excludeTags.filter(x => x.trim().length !== 0).length > 0) {
|
if (
|
||||||
|
scrapeOptions.excludeTags &&
|
||||||
|
scrapeOptions.excludeTags.filter((x) => x.trim().length !== 0).length > 0
|
||||||
|
) {
|
||||||
scrapeOptions.excludeTags.forEach((tag) => {
|
scrapeOptions.excludeTags.forEach((tag) => {
|
||||||
let elementsToRemove: Cheerio<AnyNode>;
|
let elementsToRemove: Cheerio<AnyNode>;
|
||||||
if (tag.startsWith("*") && tag.endsWith("*")) {
|
if (tag.startsWith("*") && tag.endsWith("*")) {
|
||||||
@@ -105,8 +109,9 @@ export const removeUnwantedElements = (
|
|||||||
|
|
||||||
if (scrapeOptions.onlyMainContent) {
|
if (scrapeOptions.onlyMainContent) {
|
||||||
excludeNonMainTags.forEach((tag) => {
|
excludeNonMainTags.forEach((tag) => {
|
||||||
const elementsToRemove = soup(tag)
|
const elementsToRemove = soup(tag).filter(
|
||||||
.filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join(""));
|
forceIncludeMainTags.map((x) => ":not(:has(" + x + "))").join("")
|
||||||
|
);
|
||||||
|
|
||||||
elementsToRemove.remove();
|
elementsToRemove.remove();
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -2,8 +2,8 @@ import { InternalOptions } from "..";
|
|||||||
import { ScrapeOptions } from "../../../controllers/v1/types";
|
import { ScrapeOptions } from "../../../controllers/v1/types";
|
||||||
|
|
||||||
export type UrlSpecificParams = {
|
export type UrlSpecificParams = {
|
||||||
scrapeOptions: Partial<ScrapeOptions>,
|
scrapeOptions: Partial<ScrapeOptions>;
|
||||||
internalOptions: Partial<InternalOptions>,
|
internalOptions: Partial<InternalOptions>;
|
||||||
};
|
};
|
||||||
|
|
||||||
// const docsParam: UrlSpecificParams = {
|
// const docsParam: UrlSpecificParams = {
|
||||||
@@ -46,6 +46,6 @@ export const urlSpecificParams: Record<string, UrlSpecificParams> = {
|
|||||||
},
|
},
|
||||||
"lorealparis.hu": {
|
"lorealparis.hu": {
|
||||||
scrapeOptions: {},
|
scrapeOptions: {},
|
||||||
internalOptions: { forceEngine: "fire-engine;tlsclient" },
|
internalOptions: { forceEngine: "fire-engine;tlsclient" }
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ const testEngines: (Engine | undefined)[] = [
|
|||||||
"fire-engine;tlsclient",
|
"fire-engine;tlsclient",
|
||||||
"scrapingbee",
|
"scrapingbee",
|
||||||
"scrapingbeeLoad",
|
"scrapingbeeLoad",
|
||||||
"fetch",
|
"fetch"
|
||||||
];
|
];
|
||||||
|
|
||||||
const testEnginesScreenshot: (Engine | undefined)[] = [
|
const testEnginesScreenshot: (Engine | undefined)[] = [
|
||||||
@@ -21,13 +21,18 @@ const testEnginesScreenshot: (Engine | undefined)[] = [
|
|||||||
"fire-engine;chrome-cdp",
|
"fire-engine;chrome-cdp",
|
||||||
"fire-engine;playwright",
|
"fire-engine;playwright",
|
||||||
"scrapingbee",
|
"scrapingbee",
|
||||||
"scrapingbeeLoad",
|
"scrapingbeeLoad"
|
||||||
];
|
];
|
||||||
|
|
||||||
describe("Standalone scrapeURL tests", () => {
|
describe("Standalone scrapeURL tests", () => {
|
||||||
describe.each(testEngines)("Engine %s", (forceEngine: Engine | undefined) => {
|
describe.each(testEngines)("Engine %s", (forceEngine: Engine | undefined) => {
|
||||||
it("Basic scrape", async () => {
|
it("Basic scrape", async () => {
|
||||||
const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-basic",
|
||||||
|
"https://www.roastmywebsite.ai/",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
@@ -64,13 +69,17 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
);
|
);
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
}
|
}
|
||||||
|
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape with formats markdown and html", async () => {
|
it("Scrape with formats markdown and html", async () => {
|
||||||
const out = await scrapeURL("test:scrape-formats-markdown-html", "https://roastmywebsite.ai", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
formats: ["markdown", "html"],
|
"test:scrape-formats-markdown-html",
|
||||||
}), { forceEngine });
|
"https://roastmywebsite.ai",
|
||||||
|
scrapeOptions.parse({
|
||||||
|
formats: ["markdown", "html"]
|
||||||
|
}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
@@ -84,13 +93,17 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
|
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape with onlyMainContent disabled", async () => {
|
it("Scrape with onlyMainContent disabled", async () => {
|
||||||
const out = await scrapeURL("test:scrape-onlyMainContent-false", "https://www.scrapethissite.com/", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
onlyMainContent: false,
|
"test:scrape-onlyMainContent-false",
|
||||||
}), { forceEngine });
|
"https://www.scrapethissite.com/",
|
||||||
|
scrapeOptions.parse({
|
||||||
|
onlyMainContent: false
|
||||||
|
}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
@@ -105,10 +118,15 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape with excludeTags", async () => {
|
it("Scrape with excludeTags", async () => {
|
||||||
const out = await scrapeURL("test:scrape-excludeTags", "https://www.scrapethissite.com/", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-excludeTags",
|
||||||
|
"https://www.scrapethissite.com/",
|
||||||
|
scrapeOptions.parse({
|
||||||
onlyMainContent: false,
|
onlyMainContent: false,
|
||||||
excludeTags: ['.nav', '#footer', 'strong'],
|
excludeTags: [".nav", "#footer", "strong"]
|
||||||
}), { forceEngine });
|
}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
@@ -123,186 +141,261 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape of a page with 400 status code", async () => {
|
it("Scrape of a page with 400 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-400",
|
||||||
|
"https://httpstat.us/400",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(400);
|
expect(out.document.metadata.statusCode).toBe(400);
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape of a page with 401 status code", async () => {
|
it("Scrape of a page with 401 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-401",
|
||||||
|
"https://httpstat.us/401",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(401);
|
expect(out.document.metadata.statusCode).toBe(401);
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape of a page with 403 status code", async () => {
|
it("Scrape of a page with 403 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-403",
|
||||||
|
"https://httpstat.us/403",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(403);
|
expect(out.document.metadata.statusCode).toBe(403);
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape of a page with 404 status code", async () => {
|
it("Scrape of a page with 404 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-404",
|
||||||
|
"https://httpstat.us/404",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(404);
|
expect(out.document.metadata.statusCode).toBe(404);
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape of a page with 405 status code", async () => {
|
it("Scrape of a page with 405 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-405",
|
||||||
|
"https://httpstat.us/405",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(405);
|
expect(out.document.metadata.statusCode).toBe(405);
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape of a page with 500 status code", async () => {
|
it("Scrape of a page with 500 status code", async () => {
|
||||||
const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-500",
|
||||||
|
"https://httpstat.us/500",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(500);
|
expect(out.document.metadata.statusCode).toBe(500);
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape a redirected page", async () => {
|
it("Scrape a redirected page", async () => {
|
||||||
const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine });
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-redirect",
|
||||||
|
"https://scrapethissite.com/",
|
||||||
|
scrapeOptions.parse({}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document.markdown).toContain("Explore Sandbox");
|
expect(out.document.markdown).toContain("Explore Sandbox");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.sourceURL).toBe("https://scrapethissite.com/");
|
expect(out.document.metadata.sourceURL).toBe(
|
||||||
expect(out.document.metadata.url).toBe("https://www.scrapethissite.com/");
|
"https://scrapethissite.com/"
|
||||||
|
);
|
||||||
|
expect(out.document.metadata.url).toBe(
|
||||||
|
"https://www.scrapethissite.com/"
|
||||||
|
);
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
});
|
});
|
||||||
|
|
||||||
describe.each(testEnginesScreenshot)("Screenshot on engine %s", (forceEngine: Engine | undefined) => {
|
describe.each(testEnginesScreenshot)(
|
||||||
|
"Screenshot on engine %s",
|
||||||
|
(forceEngine: Engine | undefined) => {
|
||||||
it("Scrape with screenshot", async () => {
|
it("Scrape with screenshot", async () => {
|
||||||
const out = await scrapeURL("test:scrape-screenshot", "https://www.scrapethissite.com/", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
formats: ["screenshot"],
|
"test:scrape-screenshot",
|
||||||
}), { forceEngine });
|
"https://www.scrapethissite.com/",
|
||||||
|
scrapeOptions.parse({
|
||||||
|
formats: ["screenshot"]
|
||||||
|
}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('screenshot');
|
expect(out.document).toHaveProperty("screenshot");
|
||||||
expect(typeof out.document.screenshot).toBe("string");
|
expect(typeof out.document.screenshot).toBe("string");
|
||||||
expect(out.document.screenshot!.startsWith("https://service.firecrawl.dev/storage/v1/object/public/media/"));
|
expect(
|
||||||
|
out.document.screenshot!.startsWith(
|
||||||
|
"https://service.firecrawl.dev/storage/v1/object/public/media/"
|
||||||
|
)
|
||||||
|
);
|
||||||
// TODO: attempt to fetch screenshot
|
// TODO: attempt to fetch screenshot
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
it("Scrape with full-page screenshot", async () => {
|
it("Scrape with full-page screenshot", async () => {
|
||||||
const out = await scrapeURL("test:scrape-screenshot-fullPage", "https://www.scrapethissite.com/", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
formats: ["screenshot@fullPage"],
|
"test:scrape-screenshot-fullPage",
|
||||||
}), { forceEngine });
|
"https://www.scrapethissite.com/",
|
||||||
|
scrapeOptions.parse({
|
||||||
|
formats: ["screenshot@fullPage"]
|
||||||
|
}),
|
||||||
|
{ forceEngine }
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('screenshot');
|
expect(out.document).toHaveProperty("screenshot");
|
||||||
expect(typeof out.document.screenshot).toBe("string");
|
expect(typeof out.document.screenshot).toBe("string");
|
||||||
expect(out.document.screenshot!.startsWith("https://service.firecrawl.dev/storage/v1/object/public/media/"));
|
expect(
|
||||||
|
out.document.screenshot!.startsWith(
|
||||||
|
"https://service.firecrawl.dev/storage/v1/object/public/media/"
|
||||||
|
)
|
||||||
|
);
|
||||||
// TODO: attempt to fetch screenshot
|
// TODO: attempt to fetch screenshot
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
}, 30000);
|
}, 30000);
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
it("Scrape of a PDF file", async () => {
|
it("Scrape of a PDF file", async () => {
|
||||||
const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}));
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-pdf",
|
||||||
|
"https://arxiv.org/pdf/astro-ph/9301001.pdf",
|
||||||
|
scrapeOptions.parse({})
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.markdown).toContain('Broad Line Radio Galaxy');
|
expect(out.document.markdown).toContain("Broad Line Radio Galaxy");
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
}, 60000);
|
}, 60000);
|
||||||
|
|
||||||
it("Scrape a DOCX file", async () => {
|
it("Scrape a DOCX file", async () => {
|
||||||
const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}));
|
const out = await scrapeURL(
|
||||||
|
"test:scrape-docx",
|
||||||
|
"https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx",
|
||||||
|
scrapeOptions.parse({})
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.markdown).toContain('SERIES A PREFERRED STOCK PURCHASE AGREEMENT');
|
expect(out.document.markdown).toContain(
|
||||||
|
"SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
|
||||||
|
);
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
}
|
}
|
||||||
}, 60000)
|
}, 60000);
|
||||||
|
|
||||||
it("LLM extract with prompt and schema", async () => {
|
it("LLM extract with prompt and schema", async () => {
|
||||||
const out = await scrapeURL("test:llm-extract-prompt-schema", "https://firecrawl.dev", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
|
"test:llm-extract-prompt-schema",
|
||||||
|
"https://firecrawl.dev",
|
||||||
|
scrapeOptions.parse({
|
||||||
formats: ["extract"],
|
formats: ["extract"],
|
||||||
extract: {
|
extract: {
|
||||||
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
prompt:
|
||||||
|
"Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||||
schema: {
|
schema: {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
company_mission: { type: "string" },
|
company_mission: { type: "string" },
|
||||||
supports_sso: { type: "boolean" },
|
supports_sso: { type: "boolean" },
|
||||||
is_open_source: { type: "boolean" },
|
is_open_source: { type: "boolean" }
|
||||||
},
|
},
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"],
|
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||||
additionalProperties: false,
|
additionalProperties: false
|
||||||
},
|
}
|
||||||
},
|
}
|
||||||
}));
|
})
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
@@ -316,10 +409,13 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.extract.supports_sso).toBe(false);
|
expect(out.document.extract.supports_sso).toBe(false);
|
||||||
expect(out.document.extract.is_open_source).toBe(true);
|
expect(out.document.extract.is_open_source).toBe(true);
|
||||||
}
|
}
|
||||||
}, 120000)
|
}, 120000);
|
||||||
|
|
||||||
it("LLM extract with schema only", async () => {
|
it("LLM extract with schema only", async () => {
|
||||||
const out = await scrapeURL("test:llm-extract-schema", "https://firecrawl.dev", scrapeOptions.parse({
|
const out = await scrapeURL(
|
||||||
|
"test:llm-extract-schema",
|
||||||
|
"https://firecrawl.dev",
|
||||||
|
scrapeOptions.parse({
|
||||||
formats: ["extract"],
|
formats: ["extract"],
|
||||||
extract: {
|
extract: {
|
||||||
schema: {
|
schema: {
|
||||||
@@ -327,13 +423,14 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
properties: {
|
properties: {
|
||||||
company_mission: { type: "string" },
|
company_mission: { type: "string" },
|
||||||
supports_sso: { type: "boolean" },
|
supports_sso: { type: "boolean" },
|
||||||
is_open_source: { type: "boolean" },
|
is_open_source: { type: "boolean" }
|
||||||
},
|
},
|
||||||
required: ["company_mission", "supports_sso", "is_open_source"],
|
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||||
additionalProperties: false,
|
additionalProperties: false
|
||||||
},
|
}
|
||||||
},
|
}
|
||||||
}));
|
})
|
||||||
|
);
|
||||||
|
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
expect(out.success).toBe(true);
|
expect(out.success).toBe(true);
|
||||||
@@ -347,9 +444,11 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
expect(out.document.extract.supports_sso).toBe(false);
|
expect(out.document.extract.supports_sso).toBe(false);
|
||||||
expect(out.document.extract.is_open_source).toBe(true);
|
expect(out.document.extract.is_open_source).toBe(true);
|
||||||
}
|
}
|
||||||
}, 120000)
|
}, 120000);
|
||||||
|
|
||||||
test.concurrent.each(new Array(100).fill(0).map((_, i) => i))("Concurrent scrape #%i", async (i) => {
|
test.concurrent.each(new Array(100).fill(0).map((_, i) => i))(
|
||||||
|
"Concurrent scrape #%i",
|
||||||
|
async (i) => {
|
||||||
const url = "https://www.scrapethissite.com/?i=" + i;
|
const url = "https://www.scrapethissite.com/?i=" + i;
|
||||||
const id = "test:concurrent:" + url;
|
const id = "test:concurrent:" + url;
|
||||||
const out = await scrapeURL(id, url, scrapeOptions.parse({}));
|
const out = await scrapeURL(id, url, scrapeOptions.parse({}));
|
||||||
@@ -361,16 +460,16 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
message: value.message,
|
message: value.message,
|
||||||
name: value.name,
|
name: value.name,
|
||||||
cause: value.cause,
|
cause: value.cause,
|
||||||
stack: value.stack,
|
stack: value.stack
|
||||||
}
|
};
|
||||||
} else {
|
} else {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
// verify that log collection works properly while concurrency is happening
|
// verify that log collection works properly while concurrency is happening
|
||||||
// expect(out.logs.length).toBeGreaterThan(0);
|
// expect(out.logs.length).toBeGreaterThan(0);
|
||||||
const weirdLogs = out.logs.filter(x => x.scrapeId !== id);
|
const weirdLogs = out.logs.filter((x) => x.scrapeId !== id);
|
||||||
if (weirdLogs.length > 0) {
|
if (weirdLogs.length > 0) {
|
||||||
console.warn(JSON.stringify(weirdLogs, replacer));
|
console.warn(JSON.stringify(weirdLogs, replacer));
|
||||||
}
|
}
|
||||||
@@ -381,10 +480,12 @@ describe("Standalone scrapeURL tests", () => {
|
|||||||
|
|
||||||
if (out.success) {
|
if (out.success) {
|
||||||
expect(out.document.warning).toBeUndefined();
|
expect(out.document.warning).toBeUndefined();
|
||||||
expect(out.document).toHaveProperty('markdown');
|
expect(out.document).toHaveProperty("markdown");
|
||||||
expect(out.document).toHaveProperty('metadata');
|
expect(out.document).toHaveProperty("metadata");
|
||||||
expect(out.document.metadata.error).toBeUndefined();
|
expect(out.document.metadata.error).toBeUndefined();
|
||||||
expect(out.document.metadata.statusCode).toBe(200);
|
expect(out.document.metadata.statusCode).toBe(200);
|
||||||
}
|
}
|
||||||
}, 30000);
|
},
|
||||||
})
|
30000
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|||||||
@@ -3,10 +3,16 @@ import { Meta } from "..";
|
|||||||
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
|
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
|
||||||
|
|
||||||
export function saveToCache(meta: Meta, document: Document): Document {
|
export function saveToCache(meta: Meta, document: Document): Document {
|
||||||
if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document;
|
if (
|
||||||
|
document.metadata.statusCode! < 200 ||
|
||||||
|
document.metadata.statusCode! >= 300
|
||||||
|
)
|
||||||
|
return document;
|
||||||
|
|
||||||
if (document.rawHtml === undefined) {
|
if (document.rawHtml === undefined) {
|
||||||
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
|
throw new Error(
|
||||||
|
"rawHtml is undefined -- this transformer is being called out of order"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
|
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
|
||||||
@@ -16,7 +22,7 @@ export function saveToCache(meta: Meta, document: Document): Document {
|
|||||||
html: document.rawHtml!,
|
html: document.rawHtml!,
|
||||||
statusCode: document.metadata.statusCode!,
|
statusCode: document.metadata.statusCode!,
|
||||||
url: document.metadata.url ?? document.metadata.sourceURL!,
|
url: document.metadata.url ?? document.metadata.sourceURL!,
|
||||||
error: document.metadata.error ?? undefined,
|
error: document.metadata.error ?? undefined
|
||||||
};
|
};
|
||||||
|
|
||||||
saveEntryToCache(key, entry);
|
saveEntryToCache(key, entry);
|
||||||
|
|||||||
@@ -9,32 +9,50 @@ import { uploadScreenshot } from "./uploadScreenshot";
|
|||||||
import { removeBase64Images } from "./removeBase64Images";
|
import { removeBase64Images } from "./removeBase64Images";
|
||||||
import { saveToCache } from "./cache";
|
import { saveToCache } from "./cache";
|
||||||
|
|
||||||
export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
|
export type Transformer = (
|
||||||
|
meta: Meta,
|
||||||
|
document: Document
|
||||||
|
) => Document | Promise<Document>;
|
||||||
|
|
||||||
export function deriveMetadataFromRawHTML(meta: Meta, document: Document): Document {
|
export function deriveMetadataFromRawHTML(
|
||||||
|
meta: Meta,
|
||||||
|
document: Document
|
||||||
|
): Document {
|
||||||
if (document.rawHtml === undefined) {
|
if (document.rawHtml === undefined) {
|
||||||
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
|
throw new Error(
|
||||||
|
"rawHtml is undefined -- this transformer is being called out of order"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.metadata = {
|
document.metadata = {
|
||||||
...extractMetadata(meta, document.rawHtml),
|
...extractMetadata(meta, document.rawHtml),
|
||||||
...document.metadata,
|
...document.metadata
|
||||||
};
|
};
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function deriveHTMLFromRawHTML(meta: Meta, document: Document): Document {
|
export function deriveHTMLFromRawHTML(
|
||||||
|
meta: Meta,
|
||||||
|
document: Document
|
||||||
|
): Document {
|
||||||
if (document.rawHtml === undefined) {
|
if (document.rawHtml === undefined) {
|
||||||
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
|
throw new Error(
|
||||||
|
"rawHtml is undefined -- this transformer is being called out of order"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.html = removeUnwantedElements(document.rawHtml, meta.options);
|
document.html = removeUnwantedElements(document.rawHtml, meta.options);
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function deriveMarkdownFromHTML(_meta: Meta, document: Document): Promise<Document> {
|
export async function deriveMarkdownFromHTML(
|
||||||
|
_meta: Meta,
|
||||||
|
document: Document
|
||||||
|
): Promise<Document> {
|
||||||
if (document.html === undefined) {
|
if (document.html === undefined) {
|
||||||
throw new Error("html is undefined -- this transformer is being called out of order");
|
throw new Error(
|
||||||
|
"html is undefined -- this transformer is being called out of order"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.markdown = await parseMarkdown(document.html);
|
document.markdown = await parseMarkdown(document.html);
|
||||||
@@ -45,7 +63,9 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
|
|||||||
// Only derive if the formats has links
|
// Only derive if the formats has links
|
||||||
if (meta.options.formats.includes("links")) {
|
if (meta.options.formats.includes("links")) {
|
||||||
if (document.html === undefined) {
|
if (document.html === undefined) {
|
||||||
throw new Error("html is undefined -- this transformer is being called out of order");
|
throw new Error(
|
||||||
|
"html is undefined -- this transformer is being called out of order"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.links = extractLinks(document.html, meta.url);
|
document.links = extractLinks(document.html, meta.url);
|
||||||
@@ -54,46 +74,74 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
|
|||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function coerceFieldsToFormats(meta: Meta, document: Document): Document {
|
export function coerceFieldsToFormats(
|
||||||
|
meta: Meta,
|
||||||
|
document: Document
|
||||||
|
): Document {
|
||||||
const formats = new Set(meta.options.formats);
|
const formats = new Set(meta.options.formats);
|
||||||
|
|
||||||
if (!formats.has("markdown") && document.markdown !== undefined) {
|
if (!formats.has("markdown") && document.markdown !== undefined) {
|
||||||
delete document.markdown;
|
delete document.markdown;
|
||||||
} else if (formats.has("markdown") && document.markdown === undefined) {
|
} else if (formats.has("markdown") && document.markdown === undefined) {
|
||||||
meta.logger.warn("Request had format: markdown, but there was no markdown field in the result.");
|
meta.logger.warn(
|
||||||
|
"Request had format: markdown, but there was no markdown field in the result."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("rawHtml") && document.rawHtml !== undefined) {
|
if (!formats.has("rawHtml") && document.rawHtml !== undefined) {
|
||||||
delete document.rawHtml;
|
delete document.rawHtml;
|
||||||
} else if (formats.has("rawHtml") && document.rawHtml === undefined) {
|
} else if (formats.has("rawHtml") && document.rawHtml === undefined) {
|
||||||
meta.logger.warn("Request had format: rawHtml, but there was no rawHtml field in the result.");
|
meta.logger.warn(
|
||||||
|
"Request had format: rawHtml, but there was no rawHtml field in the result."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("html") && document.html !== undefined) {
|
if (!formats.has("html") && document.html !== undefined) {
|
||||||
delete document.html;
|
delete document.html;
|
||||||
} else if (formats.has("html") && document.html === undefined) {
|
} else if (formats.has("html") && document.html === undefined) {
|
||||||
meta.logger.warn("Request had format: html, but there was no html field in the result.");
|
meta.logger.warn(
|
||||||
|
"Request had format: html, but there was no html field in the result."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("screenshot") && !formats.has("screenshot@fullPage") && document.screenshot !== undefined) {
|
if (
|
||||||
meta.logger.warn("Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug.");
|
!formats.has("screenshot") &&
|
||||||
|
!formats.has("screenshot@fullPage") &&
|
||||||
|
document.screenshot !== undefined
|
||||||
|
) {
|
||||||
|
meta.logger.warn(
|
||||||
|
"Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug."
|
||||||
|
);
|
||||||
delete document.screenshot;
|
delete document.screenshot;
|
||||||
} else if ((formats.has("screenshot") || formats.has("screenshot@fullPage")) && document.screenshot === undefined) {
|
} else if (
|
||||||
meta.logger.warn("Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result.");
|
(formats.has("screenshot") || formats.has("screenshot@fullPage")) &&
|
||||||
|
document.screenshot === undefined
|
||||||
|
) {
|
||||||
|
meta.logger.warn(
|
||||||
|
"Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("links") && document.links !== undefined) {
|
if (!formats.has("links") && document.links !== undefined) {
|
||||||
meta.logger.warn("Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug.");
|
meta.logger.warn(
|
||||||
|
"Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug."
|
||||||
|
);
|
||||||
delete document.links;
|
delete document.links;
|
||||||
} else if (formats.has("links") && document.links === undefined) {
|
} else if (formats.has("links") && document.links === undefined) {
|
||||||
meta.logger.warn("Request had format: links, but there was no links field in the result.");
|
meta.logger.warn(
|
||||||
|
"Request had format: links, but there was no links field in the result."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!formats.has("extract") && document.extract !== undefined) {
|
if (!formats.has("extract") && document.extract !== undefined) {
|
||||||
meta.logger.warn("Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.");
|
meta.logger.warn(
|
||||||
|
"Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug."
|
||||||
|
);
|
||||||
delete document.extract;
|
delete document.extract;
|
||||||
} else if (formats.has("extract") && document.extract === undefined) {
|
} else if (formats.has("extract") && document.extract === undefined) {
|
||||||
meta.logger.warn("Request had format: extract, but there was no extract field in the result.");
|
meta.logger.warn(
|
||||||
|
"Request had format: extract, but there was no extract field in the result."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
|
if (meta.options.actions === undefined || meta.options.actions.length === 0) {
|
||||||
@@ -113,16 +161,21 @@ export const transformerStack: Transformer[] = [
|
|||||||
uploadScreenshot,
|
uploadScreenshot,
|
||||||
performLLMExtract,
|
performLLMExtract,
|
||||||
coerceFieldsToFormats,
|
coerceFieldsToFormats,
|
||||||
removeBase64Images,
|
removeBase64Images
|
||||||
];
|
];
|
||||||
|
|
||||||
export async function executeTransformers(meta: Meta, document: Document): Promise<Document> {
|
export async function executeTransformers(
|
||||||
|
meta: Meta,
|
||||||
|
document: Document
|
||||||
|
): Promise<Document> {
|
||||||
const executions: [string, number][] = [];
|
const executions: [string, number][] = [];
|
||||||
|
|
||||||
for (const transformer of transformerStack) {
|
for (const transformer of transformerStack) {
|
||||||
const _meta = {
|
const _meta = {
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({ method: "executeTransformers/" + transformer.name }),
|
logger: meta.logger.child({
|
||||||
|
method: "executeTransformers/" + transformer.name
|
||||||
|
})
|
||||||
};
|
};
|
||||||
const start = Date.now();
|
const start = Date.now();
|
||||||
document = await transformer(_meta, document);
|
document = await transformer(_meta, document);
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ export class LLMRefusalError extends Error {
|
|||||||
public results: EngineResultsTracker | undefined;
|
public results: EngineResultsTracker | undefined;
|
||||||
|
|
||||||
constructor(refusal: string) {
|
constructor(refusal: string) {
|
||||||
super("LLM refused to extract the website's content")
|
super("LLM refused to extract the website's content");
|
||||||
this.refusal = refusal;
|
this.refusal = refusal;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -22,19 +22,24 @@ function normalizeSchema(x: any): any {
|
|||||||
if (typeof x !== "object" || x === null) return x;
|
if (typeof x !== "object" || x === null) return x;
|
||||||
|
|
||||||
if (x["$defs"] !== null && typeof x["$defs"] === "object") {
|
if (x["$defs"] !== null && typeof x["$defs"] === "object") {
|
||||||
x["$defs"] = Object.fromEntries(Object.entries(x["$defs"]).map(([name, schema]) => [name, normalizeSchema(schema)]));
|
x["$defs"] = Object.fromEntries(
|
||||||
|
Object.entries(x["$defs"]).map(([name, schema]) => [
|
||||||
|
name,
|
||||||
|
normalizeSchema(schema)
|
||||||
|
])
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (x && x.anyOf) {
|
if (x && x.anyOf) {
|
||||||
x.anyOf = x.anyOf.map(x => normalizeSchema(x));
|
x.anyOf = x.anyOf.map((x) => normalizeSchema(x));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (x && x.oneOf) {
|
if (x && x.oneOf) {
|
||||||
x.oneOf = x.oneOf.map(x => normalizeSchema(x));
|
x.oneOf = x.oneOf.map((x) => normalizeSchema(x));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (x && x.allOf) {
|
if (x && x.allOf) {
|
||||||
x.allOf = x.allOf.map(x => normalizeSchema(x));
|
x.allOf = x.allOf.map((x) => normalizeSchema(x));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (x && x.not) {
|
if (x && x.not) {
|
||||||
@@ -44,26 +49,35 @@ function normalizeSchema(x: any): any {
|
|||||||
if (x && x.type === "object") {
|
if (x && x.type === "object") {
|
||||||
return {
|
return {
|
||||||
...x,
|
...x,
|
||||||
properties: Object.fromEntries(Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)])),
|
properties: Object.fromEntries(
|
||||||
|
Object.entries(x.properties).map(([k, v]) => [k, normalizeSchema(v)])
|
||||||
|
),
|
||||||
required: Object.keys(x.properties),
|
required: Object.keys(x.properties),
|
||||||
additionalProperties: false,
|
additionalProperties: false
|
||||||
}
|
};
|
||||||
} else if (x && x.type === "array") {
|
} else if (x && x.type === "array") {
|
||||||
return {
|
return {
|
||||||
...x,
|
...x,
|
||||||
items: normalizeSchema(x.items),
|
items: normalizeSchema(x.items)
|
||||||
}
|
};
|
||||||
} else {
|
} else {
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string, isExtractEndpoint?: boolean): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
|
export async function generateOpenAICompletions(
|
||||||
|
logger: Logger,
|
||||||
|
options: ExtractOptions,
|
||||||
|
markdown?: string,
|
||||||
|
previousWarning?: string,
|
||||||
|
isExtractEndpoint?: boolean
|
||||||
|
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
|
||||||
let extract: any;
|
let extract: any;
|
||||||
let warning: string | undefined;
|
let warning: string | undefined;
|
||||||
|
|
||||||
const openai = new OpenAI();
|
const openai = new OpenAI();
|
||||||
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
|
const model: TiktokenModel =
|
||||||
|
(process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
|
||||||
|
|
||||||
if (markdown === undefined) {
|
if (markdown === undefined) {
|
||||||
throw new Error("document.markdown is undefined -- this is unexpected");
|
throw new Error("document.markdown is undefined -- this is unexpected");
|
||||||
@@ -83,7 +97,10 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
|||||||
|
|
||||||
markdown = markdown.slice(0, maxTokens * modifier);
|
markdown = markdown.slice(0, maxTokens * modifier);
|
||||||
|
|
||||||
let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
|
let w =
|
||||||
|
"Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" +
|
||||||
|
maxTokens +
|
||||||
|
") we support.";
|
||||||
warning = previousWarning === undefined ? w : w + " " + previousWarning;
|
warning = previousWarning === undefined ? w : w + " " + previousWarning;
|
||||||
} finally {
|
} finally {
|
||||||
// Free the encoder resources after use
|
// Free the encoder resources after use
|
||||||
@@ -94,7 +111,12 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
|||||||
// trim the document to the maximum number of tokens, tokens != characters
|
// trim the document to the maximum number of tokens, tokens != characters
|
||||||
markdown = markdown.slice(0, maxTokens * modifier);
|
markdown = markdown.slice(0, maxTokens * modifier);
|
||||||
|
|
||||||
const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
|
const w =
|
||||||
|
"The extraction content would have used more tokens (" +
|
||||||
|
numTokens +
|
||||||
|
") than the maximum we allow (" +
|
||||||
|
maxTokens +
|
||||||
|
"). -- the input has been automatically trimmed.";
|
||||||
warning = previousWarning === undefined ? w : w + " " + previousWarning;
|
warning = previousWarning === undefined ? w : w + " " + previousWarning;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,12 +125,12 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
|||||||
schema = {
|
schema = {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: {
|
properties: {
|
||||||
items: options.schema,
|
items: options.schema
|
||||||
},
|
},
|
||||||
required: ["items"],
|
required: ["items"],
|
||||||
additionalProperties: false,
|
additionalProperties: false
|
||||||
};
|
};
|
||||||
} else if (schema && typeof schema === 'object' && !schema.type) {
|
} else if (schema && typeof schema === "object" && !schema.type) {
|
||||||
schema = {
|
schema = {
|
||||||
type: "object",
|
type: "object",
|
||||||
properties: Object.fromEntries(
|
properties: Object.fromEntries(
|
||||||
@@ -127,27 +149,30 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
|||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
role: "system",
|
role: "system",
|
||||||
content: options.systemPrompt,
|
content: options.systemPrompt
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
content: [{ type: "text", text: markdown }],
|
content: [{ type: "text", text: markdown }]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
content: options.prompt !== undefined
|
content:
|
||||||
|
options.prompt !== undefined
|
||||||
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
|
? `Transform the above content into structured JSON output based on the following user request: ${options.prompt}`
|
||||||
: "Transform the above content into structured JSON output.",
|
: "Transform the above content into structured JSON output."
|
||||||
},
|
}
|
||||||
],
|
],
|
||||||
response_format: options.schema ? {
|
response_format: options.schema
|
||||||
|
? {
|
||||||
type: "json_schema",
|
type: "json_schema",
|
||||||
json_schema: {
|
json_schema: {
|
||||||
name: "websiteContent",
|
name: "websiteContent",
|
||||||
schema: schema,
|
schema: schema,
|
||||||
strict: true,
|
strict: true
|
||||||
}
|
}
|
||||||
} : { type: "json_object" },
|
}
|
||||||
|
: { type: "json_object" }
|
||||||
});
|
});
|
||||||
|
|
||||||
if (jsonCompletion.choices[0].message.refusal !== null) {
|
if (jsonCompletion.choices[0].message.refusal !== null) {
|
||||||
@@ -161,30 +186,45 @@ export async function generateOpenAICompletions(logger: Logger, options: Extract
|
|||||||
if (!isExtractEndpoint) {
|
if (!isExtractEndpoint) {
|
||||||
extract = JSON.parse(jsonCompletion.choices[0].message.content);
|
extract = JSON.parse(jsonCompletion.choices[0].message.content);
|
||||||
} else {
|
} else {
|
||||||
const extractData = JSON.parse(jsonCompletion.choices[0].message.content);
|
const extractData = JSON.parse(
|
||||||
|
jsonCompletion.choices[0].message.content
|
||||||
|
);
|
||||||
extract = options.schema ? extractData.data.extract : extractData;
|
extract = options.schema ? extractData.data.extract : extractData;
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
|
logger.error("Failed to parse returned JSON, no schema specified.", {
|
||||||
throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
|
error: e
|
||||||
|
});
|
||||||
|
throw new LLMRefusalError(
|
||||||
|
"Failed to parse returned JSON. Please specify a schema in the extract object."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the users actually wants the items object, they can specify it as 'required' in the schema
|
// If the users actually wants the items object, they can specify it as 'required' in the schema
|
||||||
// otherwise, we just return the items array
|
// otherwise, we just return the items array
|
||||||
if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
|
if (
|
||||||
|
options.schema &&
|
||||||
|
options.schema.type === "array" &&
|
||||||
|
!schema?.required?.includes("items")
|
||||||
|
) {
|
||||||
extract = extract?.items;
|
extract = extract?.items;
|
||||||
}
|
}
|
||||||
return { extract, warning, numTokens };
|
return { extract, warning, numTokens };
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
|
export async function performLLMExtract(
|
||||||
|
meta: Meta,
|
||||||
|
document: Document
|
||||||
|
): Promise<Document> {
|
||||||
if (meta.options.formats.includes("extract")) {
|
if (meta.options.formats.includes("extract")) {
|
||||||
const { extract, warning } = await generateOpenAICompletions(
|
const { extract, warning } = await generateOpenAICompletions(
|
||||||
meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
|
meta.logger.child({
|
||||||
|
method: "performLLMExtract/generateOpenAICompletions"
|
||||||
|
}),
|
||||||
meta.options.extract!,
|
meta.options.extract!,
|
||||||
document.markdown,
|
document.markdown,
|
||||||
document.warning,
|
document.warning
|
||||||
);
|
);
|
||||||
document.extract = extract;
|
document.extract = extract;
|
||||||
document.warning = warning;
|
document.warning = warning;
|
||||||
|
|||||||
@@ -5,7 +5,10 @@ const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g;
|
|||||||
|
|
||||||
export function removeBase64Images(meta: Meta, document: Document): Document {
|
export function removeBase64Images(meta: Meta, document: Document): Document {
|
||||||
if (meta.options.removeBase64Images && document.markdown !== undefined) {
|
if (meta.options.removeBase64Images && document.markdown !== undefined) {
|
||||||
document.markdown = document.markdown.replace(regex, '$1(<Base64-Image-Removed>)');
|
document.markdown = document.markdown.replace(
|
||||||
|
regex,
|
||||||
|
"$1(<Base64-Image-Removed>)"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
@@ -6,18 +6,26 @@ import { Meta } from "..";
|
|||||||
import { Document } from "../../../controllers/v1/types";
|
import { Document } from "../../../controllers/v1/types";
|
||||||
|
|
||||||
export function uploadScreenshot(meta: Meta, document: Document): Document {
|
export function uploadScreenshot(meta: Meta, document: Document): Document {
|
||||||
if (process.env.USE_DB_AUTHENTICATION === "true" && document.screenshot !== undefined && document.screenshot.startsWith("data:")) {
|
if (
|
||||||
|
process.env.USE_DB_AUTHENTICATION === "true" &&
|
||||||
|
document.screenshot !== undefined &&
|
||||||
|
document.screenshot.startsWith("data:")
|
||||||
|
) {
|
||||||
meta.logger.debug("Uploading screenshot to Supabase...");
|
meta.logger.debug("Uploading screenshot to Supabase...");
|
||||||
|
|
||||||
const fileName = `screenshot-${crypto.randomUUID()}.png`;
|
const fileName = `screenshot-${crypto.randomUUID()}.png`;
|
||||||
|
|
||||||
supabase_service.storage
|
supabase_service.storage
|
||||||
.from("media")
|
.from("media")
|
||||||
.upload(fileName, Buffer.from(document.screenshot.split(",")[1], "base64"), {
|
.upload(
|
||||||
|
fileName,
|
||||||
|
Buffer.from(document.screenshot.split(",")[1], "base64"),
|
||||||
|
{
|
||||||
cacheControl: "3600",
|
cacheControl: "3600",
|
||||||
upsert: false,
|
upsert: false,
|
||||||
contentType: document.screenshot.split(":")[1].split(";")[0],
|
contentType: document.screenshot.split(":")[1].split(";")[0]
|
||||||
});
|
}
|
||||||
|
);
|
||||||
|
|
||||||
document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`;
|
document.screenshot = `https://service.firecrawl.dev/storage/v1/object/public/media/${encodeURIComponent(fileName)}`;
|
||||||
}
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user