Sitemap fixes (#1010)

* sitemap fixes iter 1

* feat(sitemap): dedupe improvements

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
Gergő Móricz
2024-12-27 19:59:26 +01:00
committed by GitHub
parent 6851281beb
commit 0421f81020
9 changed files with 341 additions and 291 deletions
+43 -48
View File
@@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) {
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions,
scrapeOptions,
internalOptions,
team_id,
plan,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: jobPriority,
},
};
});
if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
}
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls",
crawlerOptions,
scrapeOptions,
internalOptions,
team_id,
plan,
origin: req.body.origin ?? defaultOrigin,
crawl_id: id,
sitemapped: true,
},
opts: {
jobId: uuid,
priority: jobPriority,
},
};
});
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
for (const job of jobs) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
});
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
for (const job of jobs) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
} else {
if (sitemap === 0) {
await lockURL(id, sc, url);
// Not needed, first one should be 15.