From 6c94db7ed04fa6bc3db0eb984f0fc98cb4f867b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 16 Jan 2025 16:56:13 +0100 Subject: [PATCH] fix(html,markdown): always get absolute links --- .../scrapeURL/lib/removeUnwantedElements.ts | 16 +++++++++++++++- .../src/scraper/scrapeURL/transformers/index.ts | 4 ++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts index 2c23c2f1..92df68f4 100644 --- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts +++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts @@ -49,12 +49,14 @@ const excludeNonMainTags = [ const forceIncludeMainTags = ["#main"]; -export const removeUnwantedElements = ( +export const htmlTransform = ( html: string, + url: string, scrapeOptions: ScrapeOptions, ) => { const soup = load(html); + // remove unwanted elements if ( scrapeOptions.includeTags && scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0 @@ -138,6 +140,18 @@ export const removeUnwantedElements = ( el.attribs.src = sizes[0]?.url; }); + // absolute links + soup("img[src]").each((_, el) => { + try { + el.attribs.src = new URL(el.attribs.src, url).href; + } catch (_) {} + }); + soup("a[href]").each((_, el) => { + try { + el.attribs.href = new URL(el.attribs.href, url).href; + } catch (_) {} + }); + const cleanedHtml = soup.html(); return cleanedHtml; }; diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index e14896ef..54bf0d46 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -1,7 +1,7 @@ import { parseMarkdown } from "../../../lib/html-to-markdown"; import { Meta } from ".."; import { Document } from "../../../controllers/v1/types"; -import { removeUnwantedElements } from "../lib/removeUnwantedElements"; +import { htmlTransform } from "../lib/removeUnwantedElements"; import { extractLinks } from "../lib/extractLinks"; import { extractMetadata } from "../lib/extractMetadata"; import { performLLMExtract } from "./llmExtract"; @@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML( ); } - document.html = removeUnwantedElements(document.rawHtml, meta.options); + document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options); return document; }