feat(crawl): includes/excludes fixes (FIR-1300) (#1303)

* feat(crawl): includes/excludes fixes pt. 1

* fix(snips): billing tests

* drop tha logs

* fix(ci): add replica url

* feat(crawl): drop initial scrape if it's not included

* feat(ci): more verbose logging

* fix crawl path in test

* fix(ci): wait for api

* fix(snips/scrape/ad): test for more pixels

* feat(js-sdk/crawl): add regexOnFullURL
This commit is contained in:
Gergő Móricz
2025-03-06 17:05:15 +01:00
committed by GitHub
parent f8df18ed6a
commit e1cfe1da48
11 changed files with 81 additions and 10 deletions
@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
limit: 10,
});
}, 120000);
// Crawls the pricing page with an includePaths pattern anchored to exactly
// "/pricing" and expects the crawl to complete with that single page.
it.concurrent(
  "filters URLs properly",
  async () => {
    const crawlOptions = {
      url: "https://firecrawl.dev/pricing",
      includePaths: ["^/pricing$"],
      limit: 10,
    };
    const result = await crawl(crawlOptions);

    expect(result.success).toBe(true);
    // Only inspect the payload fields once the crawl is known to have succeeded.
    if (!result.success) {
      return;
    }

    expect(result.completed).toBe(1);
    const firstPage = result.data[0];
    expect(firstPage.metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
  },
  120000,
);
// Same single-page expectation as a path-only filter, but here the
// includePaths pattern is written against the complete URL (scheme and host
// included) and regexOnFullURL is switched on.
it.concurrent(
  "filters URLs properly when using regexOnFullURL",
  async () => {
    const crawlOptions = {
      url: "https://firecrawl.dev/pricing",
      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
      regexOnFullURL: true,
      limit: 10,
    };
    const result = await crawl(crawlOptions);

    expect(result.success).toBe(true);
    // Only inspect the payload fields once the crawl is known to have succeeded.
    if (!result.success) {
      return;
    }

    expect(result.completed).toBe(1);
    const firstPage = result.data[0];
    expect(firstPage.metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
  },
  120000,
);
// Starts at the site root, which does not match the "^/blog" include pattern,
// with the sitemap ignored. Expects more than one page back, and every
// returned page to be a /blog URL.
it.concurrent(
  "discovers URLs properly when origin is not included",
  async () => {
    const crawlOptions = {
      url: "https://firecrawl.dev",
      includePaths: ["^/blog"],
      ignoreSitemap: true,
      limit: 10,
    };
    const result = await crawl(crawlOptions);

    expect(result.success).toBe(true);
    // Only inspect the payload fields once the crawl is known to have succeeded.
    if (!result.success) {
      return;
    }

    expect(result.data.length).toBeGreaterThan(1);

    const blogUrlPattern = /^https:\/\/(www\.)?firecrawl\.dev\/blog/;
    for (const { metadata } of result.data) {
      // Prefer metadata.url; fall back to sourceURL when url is null/undefined.
      const pageUrl = metadata.url ?? metadata.sourceURL;
      expect(pageUrl).toMatch(blogUrlPattern);
    }
  },
  120000,
);
});