// Source: firecrawl/apps/api/src/scraper/WebScraper/sitemap-index.ts
// (TypeScript, 63 lines, 1.6 KiB — copied from the repository file viewer)

import { logger } from "../../lib/logger";
// (viewer revision timestamp: 2025-01-22 18:47:44 -03:00)
import {
normalizeUrl,
normalizeUrlOnlyHostname,
} from "../../lib/canonical-url";
import { supabase_service } from "../../services/supabase";
import { withAuth } from "../../lib/withAuth";

/**
 * Look up the URLs previously stored for the site that `url` belongs to.
 *
 * Reads every `crawl_maps` row whose `origin_url` equals
 * `normalizeUrlOnlyHostname(url)`, newest `updated_at` first, and merges the
 * `urls` column of all rows into one de-duplicated list. Every URL is passed
 * through `normalizeUrl` before de-duplication.
 *
 * The query is attempted up to three times. Each failure is logged together
 * with the attempt number; once all attempts have failed, an empty result is
 * returned instead of throwing.
 *
 * @param url The URL to query
 * @param abort Optional signal, checked before each attempt
 * @returns `{ urls, lastUpdated }`. On success, `urls` is the merged list and
 *   `lastUpdated` is the `updated_at` value of the newest row. When no rows
 *   match, or when every attempt fails, the result is
 *   `{ urls: [], lastUpdated: new Date(0) }`.
 * @throws Whatever `abort.throwIfAborted()` throws when the signal is already
 *   aborted; that check runs outside the `try` and is not retried.
 */
async function querySitemapIndexFunction(url: string, abort?: AbortSignal) {
  const maxAttempts = 3;
  // Build a fresh fallback object per call so callers never share one array.
  const emptyResult = () => ({ urls: [], lastUpdated: new Date(0) });

  const originUrl = normalizeUrlOnlyHostname(url);

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    // Kept outside the try block so an abort propagates to the caller rather
    // than being logged and retried like a query failure.
    abort?.throwIfAborted();

    try {
      const { data, error } = await supabase_service
        .from("crawl_maps")
        .select("urls, updated_at")
        .eq("origin_url", originUrl)
        .order("updated_at", { ascending: false });

      if (error) {
        throw error;
      }

      if (!data || data.length === 0) {
        return emptyResult();
      }

      const allUrls = [
        ...new Set(
          data
            // Skip rows whose `urls` column is null/undefined so one bad row
            // does not hand a non-string to normalizeUrl and sink the lookup.
            .flatMap((entry) => entry.urls ?? [])
            .map((entryUrl) => normalizeUrl(entryUrl)),
        ),
      ];

      // Rows are ordered newest first, so data[0] carries the latest timestamp.
      // NOTE(review): this is the raw column value, whereas the fallback paths
      // return a Date — confirm callers handle both representations.
      return { urls: allUrls, lastUpdated: data[0].updated_at };
    } catch (error) {
      logger.error("(sitemap-index) Error querying the index", {
        error,
        attempt,
      });
      // Fall through to the next attempt; the loop ends after the last one.
    }
  }

  // Every attempt failed.
  return emptyResult();
}
// (viewer revision timestamp: 2025-01-22 18:47:44 -03:00)
// Value handed to withAuth as its second argument: an empty URL list with an
// epoch timestamp, the same shape querySitemapIndexFunction returns when it
// finds nothing.
const sitemapIndexFallback = {
  urls: [],
  lastUpdated: new Date(0),
};

/**
 * Exported entry point: `querySitemapIndexFunction` wrapped by `withAuth`.
 *
 * NOTE(review): presumably `withAuth` returns the fallback value when
 * authentication is not configured — confirm against `lib/withAuth`.
 */
export const querySitemapIndex = withAuth(
  querySitemapIndexFunction,
  sitemapIndexFallback,
);