fix(WebCrawler): filter out file URLs when taking URLs from sitemap

This commit is contained in:
Gergo Moricz
2024-07-18 21:49:37 +02:00
parent 95c6c63b85
commit f0e95ce399
2 changed files with 3 additions and 2 deletions
+2 -1
View File
@@ -2,6 +2,7 @@ import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
+import { WebCrawler } from "./crawler";
export async function getLinksFromSitemap(
{
@@ -41,7 +42,7 @@ export async function getLinksFromSitemap(
}
} else if (root && root.url) {
for (const url of root.url) {
-if (url.loc && url.loc.length > 0) {
+if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) {
allUrls.push(url.loc[0]);
}
}