Nick: formatting + error handling

This commit is contained in:
Nicolas
2024-10-16 23:35:03 -03:00
parent c0384ea381
commit 8974230db4
@@ -77,51 +77,71 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
title = soup("title").text() || null; title = soup("title").text() || null;
description = soup('meta[name="description"]').attr("content") || null; description = soup('meta[name="description"]').attr("content") || null;
language = soup('html').attr('lang') || null; language = soup("html").attr("lang") || null;
keywords = soup('meta[name="keywords"]').attr("content") || null; keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null;
ogTitle = soup('meta[property="og:title"]').attr("content") || null; ogTitle = soup('meta[property="og:title"]').attr("content") || null;
ogDescription = soup('meta[property="og:description"]').attr("content") || null; ogDescription =
soup('meta[property="og:description"]').attr("content") || null;
ogUrl = soup('meta[property="og:url"]').attr("content") || null; ogUrl = soup('meta[property="og:url"]').attr("content") || null;
ogImage = soup('meta[property="og:image"]').attr("content") || null; ogImage = soup('meta[property="og:image"]').attr("content") || null;
ogAudio = soup('meta[property="og:audio"]').attr("content") || null; ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; ogDeterminer =
soup('meta[property="og:determiner"]').attr("content") || null;
ogLocale = soup('meta[property="og:locale"]').attr("content") || null; ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; ogLocaleAlternate =
soup('meta[property="og:locale:alternate"]')
.map((i, el) => soup(el).attr("content"))
.get() || null;
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
ogVideo = soup('meta[property="og:video"]').attr("content") || null; ogVideo = soup('meta[property="og:video"]').attr("content") || null;
articleSection = soup('meta[name="article:section"]').attr("content") || null; articleSection =
soup('meta[name="article:section"]').attr("content") || null;
articleTag = soup('meta[name="article:tag"]').attr("content") || null; articleTag = soup('meta[name="article:tag"]').attr("content") || null;
publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; publishedTime =
modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; soup('meta[property="article:published_time"]').attr("content") || null;
dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; modifiedTime =
soup('meta[property="article:modified_time"]').attr("content") || null;
dctermsKeywords =
soup('meta[name="dcterms.keywords"]').attr("content") || null;
dcDescription = soup('meta[name="dc.description"]').attr("content") || null; dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null; dctermsSubject =
dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null; soup('meta[name="dcterms.subject"]').attr("content") || null;
dctermsAudience =
soup('meta[name="dcterms.audience"]').attr("content") || null;
dcType = soup('meta[name="dc.type"]').attr("content") || null; dcType = soup('meta[name="dc.type"]').attr("content") || null;
dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null; dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
dcDate = soup('meta[name="dc.date"]').attr("content") || null; dcDate = soup('meta[name="dc.date"]').attr("content") || null;
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; dcDateCreated =
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; soup('meta[name="dc.date.created"]').attr("content") || null;
dctermsCreated =
soup('meta[name="dcterms.created"]').attr("content") || null;
// Extract all meta tags for custom metadata try {
soup("meta").each((i, elem) => { // Extract all meta tags for custom metadata
const name = soup(elem).attr("name") || soup(elem).attr("property"); soup("meta").each((i, elem) => {
const content = soup(elem).attr("content"); try {
const name = soup(elem).attr("name") || soup(elem).attr("property");
const content = soup(elem).attr("content");
if (name && content) { if (name && content) {
if (customMetadata[name] === undefined) { if (customMetadata[name] === undefined) {
customMetadata[name] = content; customMetadata[name] = content;
} else if (Array.isArray(customMetadata[name])) { } else if (Array.isArray(customMetadata[name])) {
(customMetadata[name] as string[]).push(content); (customMetadata[name] as string[]).push(content);
} else { } else {
customMetadata[name] = [customMetadata[name] as string, content]; customMetadata[name] = [customMetadata[name] as string, content];
}
}
} catch (error) {
Logger.error(`Error extracting custom metadata (in): ${error}`);
} }
} });
}); } catch (error) {
Logger.error(`Error extracting custom metadata: ${error}`);
}
} catch (error) { } catch (error) {
Logger.error(`Error extracting metadata: ${error}`); Logger.error(`Error extracting metadata: ${error}`);
} }