Sitemap fixes (#1010)
* sitemap fixes iter 1
* feat(sitemap): dedupe improvements

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
@@ -177,56 +177,51 @@ export async function crawlController(req: Request, res: Response) {
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions?.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? 0
|
||||
: await crawler.tryGetSitemap(async urls => {
|
||||
if (urls.length === 0) return;
|
||||
|
||||
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
|
||||
const jobs = urls.map(url => {
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: jobPriority,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if (sitemap.length > 1000) {
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
|
||||
}
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
scrapeOptions,
|
||||
internalOptions,
|
||||
team_id,
|
||||
plan,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: jobPriority,
|
||||
},
|
||||
};
|
||||
});
|
||||
await lockURLs(
|
||||
id,
|
||||
sc,
|
||||
jobs.map((x) => x.data.url),
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId),
|
||||
);
|
||||
for (const job of jobs) {
|
||||
// add with sentry instrumentation
|
||||
await addScrapeJob(job.data as any, {}, job.opts.jobId);
|
||||
}
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
sc,
|
||||
jobs.map((x) => x.data.url),
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId),
|
||||
);
|
||||
for (const job of jobs) {
|
||||
// add with sentry instrumentation
|
||||
await addScrapeJob(job.data as any, {}, job.opts.jobId);
|
||||
}
|
||||
} else {
|
||||
if (sitemap === 0) {
|
||||
await lockURL(id, sc, url);
|
||||
|
||||
// Not needed, first one should be 15.
|
||||
|
||||
Reference in New Issue
Block a user