Files
firecrawl/apps/api/src/scraper/WebScraper/sitemap.ts
T

167 lines
4.8 KiB
TypeScript
Raw Normal View History

2024-04-15 17:01:47 -04:00
import axios from "axios";
2024-06-24 16:33:07 -03:00
import { axiosTimeout } from "../../lib/timeout";
2024-04-15 17:01:47 -04:00
import { parseStringPromise } from "xml2js";
import { WebCrawler } from "./crawler";
2024-11-07 20:57:33 +01:00
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
import type { Logger } from "winston";
2024-12-26 13:51:20 -03:00
/**
 * Whether a Fire Engine beta endpoint is configured: true only when the
 * FIRE_ENGINE_BETA_URL environment variable is set to a non-empty string.
 */
const useFireEngine = (process.env.FIRE_ENGINE_BETA_URL ?? "") !== "";
2024-04-15 17:01:47 -04:00
/**
 * Minimal shape of a parsed `<sitemap>` / `<url>` element. The code indexes
 * `loc[0]`, i.e. it assumes xml2js delivers each child tag as an array of strings.
 */
interface SitemapXmlNode {
  loc?: string[];
}

/** Returns the trimmed text of each entry's first `<loc>`, skipping entries that have none. */
function collectLocs(nodes: SitemapXmlNode[]): string[] {
  return nodes
    .map((node) => node.loc?.[0]?.trim())
    .filter((loc): loc is string => loc !== undefined);
}

/** True when the URL ends in ".xml" (case-insensitive) and is therefore treated as a nested sitemap. */
function looksLikeXmlSitemap(loc: string): boolean {
  return loc.toLowerCase().endsWith(".xml");
}

/**
 * Downloads the raw body of a sitemap. In "fire-engine" mode (and only when
 * Fire Engine is configured) the TLS client is tried first, with a plain axios
 * GET as fallback; otherwise axios is used directly. Rejects if the request
 * that is finally attempted fails.
 */
async function fetchSitemapContent(
  sitemapUrl: string,
  mode: "axios" | "fire-engine",
  logger: Logger,
): Promise<string> {
  if (mode === "fire-engine" && useFireEngine) {
    const scraped = await scrapeURL(
      "sitemap",
      sitemapUrl,
      scrapeOptions.parse({ formats: ["rawHtml"] }),
      { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
    );
    if (scraped.success) {
      // rawHtml was the requested format, so it is asserted to be present.
      return scraped.document.rawHtml!;
    }
    logger.debug(
      "Failed to scrape sitemap via TLSClient, falling back to axios...",
      { error: scraped.error },
    );
  }
  const fetched = await axios.get(sitemapUrl, { timeout: axiosTimeout });
  return fetched.data;
}

/**
 * Fetches a sitemap, recursively follows the nested sitemaps it references and
 * hands every page URL it finds to `urlsHandler`.
 *
 * Two document kinds are handled:
 * - a sitemap index (`<sitemap>` children): every child sitemap is processed recursively;
 * - a URL set (`<url>` children): entries ending in ".xml" are processed
 *   recursively as sitemaps, the remaining entries that `WebCrawler.isFile`
 *   does not classify as files are passed to `urlsHandler` in one call.
 *
 * @param sitemapUrl  URL of the sitemap to fetch.
 * @param urlsHandler Callback receiving the page URLs of one URL set; its result
 *                    is awaited, so it may be asynchronous.
 * @param mode        "axios" (default) for a plain HTTP GET, "fire-engine" to try
 *                    the Fire Engine TLS client first.
 * @param logger      Logger used for request and processing failures.
 * @returns The number of page URLs handed to `urlsHandler` by this sitemap and
 *          all nested ones. Never rejects: any failure is logged and yields 0.
 */
export async function getLinksFromSitemap(
  {
    sitemapUrl,
    urlsHandler,
    mode = "axios",
  }: {
    sitemapUrl: string;
    urlsHandler(urls: string[]): unknown;
    mode?: "axios" | "fire-engine";
  },
  logger: Logger,
): Promise<number> {
  // Processes child sitemaps in parallel and totals their counts. The recursive
  // calls never reject (this function catches its own errors), and the sum is
  // seeded with 0 so an empty list yields 0 instead of making reduce() throw.
  const crawlChildren = async (children: string[]): Promise<number> => {
    const counts = await Promise.all(
      children.map((child) =>
        getLinksFromSitemap({ sitemapUrl: child, urlsHandler, mode }, logger),
      ),
    );
    return counts.reduce((total, n) => total + n, 0);
  };

  // A sitemap that lists itself would otherwise be re-fetched forever. Only
  // direct self-references are caught here; longer cycles are not detected.
  const notSelf = (loc: string): boolean => loc !== sitemapUrl;

  try {
    let content: string;
    try {
      content = await fetchSitemapContent(sitemapUrl, mode, logger);
    } catch (error) {
      logger.error(`Request failed for ${sitemapUrl}`, {
        method: "getLinksFromSitemap",
        mode,
        sitemapUrl,
        error,
      });
      return 0;
    }

    const parsed = await parseStringPromise(content);
    const root = parsed.urlset || parsed.sitemapindex;
    let count = 0;

    if (root && root.sitemap) {
      // Handle sitemap index files
      count = await crawlChildren(collectLocs(root.sitemap).filter(notSelf));
    } else if (root && root.url) {
      const locs = collectLocs(root.url);

      // Some URL sets point to additional sitemaps; fetch those recursively first.
      const nestedSitemaps = locs.filter(
        (loc) => looksLikeXmlSitemap(loc) && notSelf(loc),
      );
      if (nestedSitemaps.length > 0) {
        count += await crawlChildren(nestedSitemaps);
      }

      const validUrls = locs.filter(
        (loc) => !looksLikeXmlSitemap(loc) && !WebCrawler.prototype.isFile(loc),
      );
      count += validUrls.length;
      // The handler may be sync or async; awaiting covers both.
      await urlsHandler(validUrls);
    }

    return count;
  } catch (error) {
    logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
      method: "getLinksFromSitemap",
      mode,
      sitemapUrl,
      error,
    });
  }

  return 0;
}
2024-12-11 19:46:11 -03:00
/**
 * Downloads a site's `sitemap.xml` and returns the entries of its `<urlset>`.
 *
 * @param url     Either the sitemap URL itself (ending in "/sitemap.xml") or a
 *                base URL to which "/sitemap.xml" is appended. Trailing slashes
 *                are ignored.
 * @param timeout Request timeout passed to axios; when omitted (or 0) the
 *                shared `axiosTimeout` is used.
 * @returns The parsed entries (possibly empty when the document has no
 *          `<urlset>`/`<url>` elements); `null` when the request succeeds with
 *          a status other than 200; `[]` when the request or the XML parsing
 *          throws.
 */
export const fetchSitemapData = async (
  url: string,
  timeout?: number,
): Promise<SitemapEntry[] | null> => {
  // Strip trailing slashes so "https://site/" does not become "https://site//sitemap.xml".
  const base = url.replace(/\/+$/, "");
  const sitemapUrl = base.endsWith("/sitemap.xml")
    ? base
    : `${base}/sitemap.xml`;

  try {
    const response = await axios.get(sitemapUrl, {
      // `||` rather than `??`: a timeout of 0 also falls back to the default,
      // which is the behavior existing callers get today.
      timeout: timeout || axiosTimeout,
    });
    if (response.status !== 200) {
      return null;
    }

    const parsedXml = await parseStringPromise(response.data);
    const sitemapData: SitemapEntry[] = [];

    for (const urlElement of parsedXml?.urlset?.url ?? []) {
      // An entry without a usable <loc> is skipped instead of aborting the
      // whole sitemap (indexing a missing loc used to throw and discard everything).
      const loc = urlElement.loc?.[0];
      if (typeof loc !== "string") {
        continue;
      }

      // Trimmed for consistency with getLinksFromSitemap.
      const sitemapEntry: SitemapEntry = { loc: loc.trim() };
      if (urlElement.lastmod) {
        sitemapEntry.lastmod = urlElement.lastmod[0];
      }
      if (urlElement.changefreq) {
        sitemapEntry.changefreq = urlElement.changefreq[0];
      }
      if (urlElement.priority) {
        sitemapEntry.priority = Number(urlElement.priority[0]);
      }
      sitemapData.push(sitemapEntry);
    }

    return sitemapData;
  } catch {
    // Best-effort lookup: a failed fetch or unparsable document is reported to
    // the caller as "no entries". No logger is available in this function.
    return [];
  }
};
2024-04-15 17:01:47 -04:00
/** One `<url>` entry of a sitemap, as produced by `fetchSitemapData`. */
export interface SitemapEntry {
  /** Page URL, taken from the entry's `<loc>` element. */
  loc: string;
  /** Text of the entry's `<lastmod>` element, when present. */
  lastmod?: string;
  /** Text of the entry's `<changefreq>` element, when present. */
  changefreq?: string;
  /** Value of the entry's `<priority>` element converted with `Number()`, when present. */
  priority?: number;
}