Nick: fixed prettier
This commit is contained in:
@@ -10,29 +10,39 @@ export async function getLinksFromSitemap(
|
||||
{
|
||||
sitemapUrl,
|
||||
allUrls = [],
|
||||
mode = 'axios'
|
||||
mode = "axios"
|
||||
}: {
|
||||
sitemapUrl: string,
|
||||
allUrls?: string[],
|
||||
mode?: 'axios' | 'fire-engine'
|
||||
sitemapUrl: string;
|
||||
allUrls?: string[];
|
||||
mode?: "axios" | "fire-engine";
|
||||
},
|
||||
logger: Logger,
|
||||
logger: Logger
|
||||
): Promise<string[]> {
|
||||
try {
|
||||
let content: string = "";
|
||||
try {
|
||||
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
|
||||
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
|
||||
} else if (mode === "fire-engine") {
|
||||
const response = await scrapeURL(
|
||||
"sitemap",
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }
|
||||
);
|
||||
if (!response.success) {
|
||||
throw response.error;
|
||||
}
|
||||
content = response.document.rawHtml!;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Request failed for ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error });
|
||||
logger.error(`Request failed for ${sitemapUrl}`, {
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
error
|
||||
});
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
@@ -42,26 +52,46 @@ export async function getLinksFromSitemap(
|
||||
|
||||
if (root && root.sitemap) {
|
||||
const sitemapPromises = root.sitemap
|
||||
.filter(sitemap => sitemap.loc && sitemap.loc.length > 0)
|
||||
.map(sitemap => getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode }, logger));
|
||||
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
||||
.map((sitemap) =>
|
||||
getLinksFromSitemap(
|
||||
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
|
||||
logger
|
||||
)
|
||||
);
|
||||
await Promise.all(sitemapPromises);
|
||||
} else if (root && root.url) {
|
||||
const validUrls = root.url
|
||||
.filter(url => url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0]))
|
||||
.map(url => url.loc[0]);
|
||||
.filter(
|
||||
(url) =>
|
||||
url.loc &&
|
||||
url.loc.length > 0 &&
|
||||
!WebCrawler.prototype.isFile(url.loc[0])
|
||||
)
|
||||
.map((url) => url.loc[0]);
|
||||
allUrls.push(...validUrls);
|
||||
}
|
||||
} catch (error) {
|
||||
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, { method: "getLinksFromSitemap", mode, sitemapUrl, error });
|
||||
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
error
|
||||
});
|
||||
}
|
||||
|
||||
return allUrls;
|
||||
}
|
||||
|
||||
export const fetchSitemapData = async (url: string, timeout?: number): Promise<SitemapEntry[] | null> => {
|
||||
export const fetchSitemapData = async (
|
||||
url: string,
|
||||
timeout?: number
|
||||
): Promise<SitemapEntry[] | null> => {
|
||||
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
||||
try {
|
||||
const response = await axios.get(sitemapUrl, { timeout: timeout || axiosTimeout });
|
||||
const response = await axios.get(sitemapUrl, {
|
||||
timeout: timeout || axiosTimeout
|
||||
});
|
||||
if (response.status === 200) {
|
||||
const xml = response.data;
|
||||
const parsedXml = await parseStringPromise(xml);
|
||||
@@ -71,8 +101,10 @@ export const fetchSitemapData = async (url: string, timeout?: number): Promise<S
|
||||
for (const urlElement of parsedXml.urlset.url) {
|
||||
const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
|
||||
if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
|
||||
if (urlElement.changefreq) sitemapEntry.changefreq = urlElement.changefreq[0];
|
||||
if (urlElement.priority) sitemapEntry.priority = Number(urlElement.priority[0]);
|
||||
if (urlElement.changefreq)
|
||||
sitemapEntry.changefreq = urlElement.changefreq[0];
|
||||
if (urlElement.priority)
|
||||
sitemapEntry.priority = Number(urlElement.priority[0]);
|
||||
sitemapData.push(sitemapEntry);
|
||||
}
|
||||
}
|
||||
@@ -84,11 +116,11 @@ export const fetchSitemapData = async (url: string, timeout?: number): Promise<S
|
||||
// Error handling for failed sitemap fetch
|
||||
}
|
||||
return [];
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * One record built from a `url` element of a parsed sitemap `urlset`.
 *
 * `fetchSitemapData` creates an entry per element: `loc` is always taken
 * from the element, while the remaining fields are filled in only when the
 * element carries the corresponding child.
 */
export interface SitemapEntry {
|
||||
/** First `loc` value of the url element. */
loc: string;
|
||||
/** First `lastmod` value, copied as-is; absent when the element has none. */
lastmod?: string;
|
||||
/** First `changefreq` value, copied as-is; absent when the element has none. */
changefreq?: string;
|
||||
/**
 * First `priority` value converted with `Number(...)`; absent when the
 * element has none. The conversion result is stored without validation.
 */
priority?: number;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user