fix(WebCrawler): filter out file URLs when taking URLs from sitemap

This commit is contained in:
Gergo Moricz
2024-07-18 21:49:37 +02:00
parent 95c6c63b85
commit f0e95ce399
2 changed files with 3 additions and 2 deletions
+1 -1
View File
@@ -383,7 +383,7 @@ export class WebCrawler {
 return linkDomain === baseDomain;
 }
-private isFile(url: string): boolean {
+public isFile(url: string): boolean {
 const fileExtensions = [
 ".png",
 ".jpg",
+2 -1
View File
@@ -2,6 +2,7 @@ import axios from "axios";
 import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 import { scrapWithFireEngine } from "./scrapers/fireEngine";
+import { WebCrawler } from "./crawler";
 export async function getLinksFromSitemap(
 {
@@ -41,7 +42,7 @@ export async function getLinksFromSitemap(
 }
 } else if (root && root.url) {
 for (const url of root.url) {
-if (url.loc && url.loc.length > 0) {
+if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) {
 allUrls.push(url.loc[0]);
 }
 }