// Source: firecrawl/apps/api/src/scraper/WebScraper/sitemap-index.ts

import { logger } from "../../lib/logger";
// (blame timestamp: 2025-01-21 16:22:27 -03:00)
import { normalizeUrl, normalizeUrlOnlyHostname } from "../../lib/canonical-url";
import { supabase_service } from "../../services/supabase";
/**
* Query the sitemap index for a given URL
* @param url The URL to query
* @returns A list of URLs found in the sitemap index aggregated from all sitemaps
*/
import { withAuth } from "../../lib/withAuth";
/**
 * Look up every URL stored in the `crawl_maps` table for the origin of `url`.
 *
 * The lookup key is `normalizeUrlOnlyHostname(url)`. The `urls` column of every
 * matching row is flattened into one list, each entry is passed through
 * `normalizeUrl`, and duplicates are removed (first occurrence wins).
 *
 * The query is attempted up to three times, back to back. Every failed attempt
 * is logged; if all attempts fail an empty list is returned instead of throwing.
 *
 * @param url The URL whose origin should be looked up
 * @returns The de-duplicated, normalized URLs, or `[]` when nothing is stored
 *          or the query keeps failing
 */
async function querySitemapIndexFunction(url: string): Promise<string[]> {
  const MAX_ATTEMPTS = 3;
  const originUrl = normalizeUrlOnlyHostname(url);

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      const { data, error } = await supabase_service
        .from("crawl_maps")
        .select("urls")
        .eq("origin_url", originUrl);

      if (error) {
        throw error;
      }

      // `data` can be null and a row's `urls` can be null/missing. Treat both
      // as "no URLs" instead of letting one bad row throw and wipe out the
      // URLs contributed by every other row.
      const rows: Array<{ urls?: unknown }> = data ?? [];
      const storedUrls = rows
        .map((row) => row.urls)
        .flat()
        .filter((candidate): candidate is string => typeof candidate === "string");

      return [...new Set(storedUrls.map((stored) => normalizeUrl(stored)))];
    } catch (error) {
      logger.error("(sitemap-index) Error querying the index", {
        error,
        attempt
      });
    }
  }

  // Every attempt failed: degrade to "nothing known" rather than rejecting.
  return [];
}
// Public entry point: `querySitemapIndexFunction` wrapped by `withAuth`.
// NOTE(review): the second argument (`[]`) is presumably the value `withAuth`
// returns when authentication is bypassed or unavailable — confirm against
// `../../lib/withAuth`.
export const querySitemapIndex = withAuth(querySitemapIndexFunction, []);