fix(crawler): move sitemap deduplication deeper into the process

This commit is contained in:
Móricz Gergő
2025-01-23 08:10:46 +01:00
parent aa2c369060
commit 72198123cb
2 changed files with 19 additions and 11 deletions
+15 -1
View File
@@ -19,7 +19,20 @@ export async function getLinksFromSitemap(
},
logger: Logger,
crawlId: string,
sitemapsHit: Set<string>,
): Promise<number> {
if (sitemapsHit.size >= 5) {
logger.warn("Sitemap limit of 5 hit, not hitting this one.");
return 0;
}
if (sitemapsHit.has(sitemapUrl)) {
logger.warn("This sitemap has already been hit.", { sitemapUrl });
return 0;
}
sitemapsHit.add(sitemapUrl);
try {
let content: string = "";
try {
@@ -126,7 +139,7 @@ export async function getLinksFromSitemap(
.map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId),
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit),
);
const results = await Promise.all(sitemapPromises);
@@ -149,6 +162,7 @@ export async function getLinksFromSitemap(
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
logger,
crawlId,
sitemapsHit,
),
);
count += (await Promise.all(sitemapPromises)).reduce(