feat(crawl): ensure url trimming

This commit is contained in:
Móricz Gergő
2025-01-08 12:35:42 +01:00
parent 977a3e13c5
commit 363021ea78
2 changed files with 8 additions and 7 deletions
+6 -6
View File
@@ -60,7 +60,7 @@ export async function getLinksFromSitemap(
// Handle sitemap index files // Handle sitemap index files
const sitemapUrls = root.sitemap const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0) .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) => sitemap.loc[0]); .map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) => const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap( getLinksFromSitemap(
@@ -78,9 +78,9 @@ export async function getLinksFromSitemap(
(url) => (url) =>
url.loc && url.loc &&
url.loc.length > 0 && url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml') url.loc[0].trim().toLowerCase().endsWith('.xml')
) )
.map((url) => url.loc[0]); .map((url) => url.loc[0].trim());
if (xmlSitemaps.length > 0) { if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps // Recursively fetch links from additional sitemaps
@@ -98,10 +98,10 @@ export async function getLinksFromSitemap(
(url) => (url) =>
url.loc && url.loc &&
url.loc.length > 0 && url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') && !url.loc[0].trim().toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]), !WebCrawler.prototype.isFile(url.loc[0].trim()),
) )
.map((url) => url.loc[0]); .map((url) => url.loc[0].trim());
count += validUrls.length; count += validUrls.length;
const h = urlsHandler(validUrls); const h = urlsHandler(validUrls);
@@ -7,8 +7,9 @@ export function extractLinks(html: string, baseUrl: string): string[] {
const links: string[] = []; const links: string[] = [];
$("a").each((_, element) => { $("a").each((_, element) => {
const href = $(element).attr("href"); let href = $(element).attr("href");
if (href) { if (href) {
href = href.trim();
try { try {
if (href.startsWith("http://") || href.startsWith("https://")) { if (href.startsWith("http://") || href.startsWith("https://")) {
// Absolute URL, add as is // Absolute URL, add as is