feat(crawl): ensure url trimming
This commit is contained in:
@@ -60,7 +60,7 @@ export async function getLinksFromSitemap(
|
|||||||
// Handle sitemap index files
|
// Handle sitemap index files
|
||||||
const sitemapUrls = root.sitemap
|
const sitemapUrls = root.sitemap
|
||||||
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
||||||
.map((sitemap) => sitemap.loc[0]);
|
.map((sitemap) => sitemap.loc[0].trim());
|
||||||
|
|
||||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||||
getLinksFromSitemap(
|
getLinksFromSitemap(
|
||||||
@@ -78,9 +78,9 @@ export async function getLinksFromSitemap(
|
|||||||
(url) =>
|
(url) =>
|
||||||
url.loc &&
|
url.loc &&
|
||||||
url.loc.length > 0 &&
|
url.loc.length > 0 &&
|
||||||
url.loc[0].toLowerCase().endsWith('.xml')
|
url.loc[0].trim().toLowerCase().endsWith('.xml')
|
||||||
)
|
)
|
||||||
.map((url) => url.loc[0]);
|
.map((url) => url.loc[0].trim());
|
||||||
|
|
||||||
if (xmlSitemaps.length > 0) {
|
if (xmlSitemaps.length > 0) {
|
||||||
// Recursively fetch links from additional sitemaps
|
// Recursively fetch links from additional sitemaps
|
||||||
@@ -98,10 +98,10 @@ export async function getLinksFromSitemap(
|
|||||||
(url) =>
|
(url) =>
|
||||||
url.loc &&
|
url.loc &&
|
||||||
url.loc.length > 0 &&
|
url.loc.length > 0 &&
|
||||||
!url.loc[0].toLowerCase().endsWith('.xml') &&
|
!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
|
||||||
!WebCrawler.prototype.isFile(url.loc[0]),
|
!WebCrawler.prototype.isFile(url.loc[0].trim()),
|
||||||
)
|
)
|
||||||
.map((url) => url.loc[0]);
|
.map((url) => url.loc[0].trim());
|
||||||
count += validUrls.length;
|
count += validUrls.length;
|
||||||
|
|
||||||
const h = urlsHandler(validUrls);
|
const h = urlsHandler(validUrls);
|
||||||
|
|||||||
@@ -7,8 +7,9 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
|||||||
const links: string[] = [];
|
const links: string[] = [];
|
||||||
|
|
||||||
$("a").each((_, element) => {
|
$("a").each((_, element) => {
|
||||||
const href = $(element).attr("href");
|
let href = $(element).attr("href");
|
||||||
if (href) {
|
if (href) {
|
||||||
|
href = href.trim();
|
||||||
try {
|
try {
|
||||||
if (href.startsWith("http://") || href.startsWith("https://")) {
|
if (href.startsWith("http://") || href.startsWith("https://")) {
|
||||||
// Absolute URL, add as is
|
// Absolute URL, add as is
|
||||||
|
|||||||
Reference in New Issue
Block a user