fix(WebCrawler): filter out file URLs when taking URLs from sitemap
This commit is contained in:
@@ -383,7 +383,7 @@ export class WebCrawler {
|
|||||||
return linkDomain === baseDomain;
|
return linkDomain === baseDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
private isFile(url: string): boolean {
|
public isFile(url: string): boolean {
|
||||||
const fileExtensions = [
|
const fileExtensions = [
|
||||||
".png",
|
".png",
|
||||||
".jpg",
|
".jpg",
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import axios from "axios";
|
|||||||
import { axiosTimeout } from "../../lib/timeout";
|
import { axiosTimeout } from "../../lib/timeout";
|
||||||
import { parseStringPromise } from "xml2js";
|
import { parseStringPromise } from "xml2js";
|
||||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||||
|
import { WebCrawler } from "./crawler";
|
||||||
|
|
||||||
export async function getLinksFromSitemap(
|
export async function getLinksFromSitemap(
|
||||||
{
|
{
|
||||||
@@ -41,7 +42,7 @@ export async function getLinksFromSitemap(
|
|||||||
}
|
}
|
||||||
} else if (root && root.url) {
|
} else if (root && root.url) {
|
||||||
for (const url of root.url) {
|
for (const url of root.url) {
|
||||||
if (url.loc && url.loc.length > 0) {
|
if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) {
|
||||||
allUrls.push(url.loc[0]);
|
allUrls.push(url.loc[0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user