fix(html,markdown): always get absolute links
This commit is contained in:
@@ -49,12 +49,14 @@ const excludeNonMainTags = [
|
|||||||
|
|
||||||
const forceIncludeMainTags = ["#main"];
|
const forceIncludeMainTags = ["#main"];
|
||||||
|
|
||||||
export const removeUnwantedElements = (
|
export const htmlTransform = (
|
||||||
html: string,
|
html: string,
|
||||||
|
url: string,
|
||||||
scrapeOptions: ScrapeOptions,
|
scrapeOptions: ScrapeOptions,
|
||||||
) => {
|
) => {
|
||||||
const soup = load(html);
|
const soup = load(html);
|
||||||
|
|
||||||
|
// remove unwanted elements
|
||||||
if (
|
if (
|
||||||
scrapeOptions.includeTags &&
|
scrapeOptions.includeTags &&
|
||||||
scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
|
scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
|
||||||
@@ -138,6 +140,18 @@ export const removeUnwantedElements = (
|
|||||||
el.attribs.src = sizes[0]?.url;
|
el.attribs.src = sizes[0]?.url;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// absolute links
|
||||||
|
soup("img[src]").each((_, el) => {
|
||||||
|
try {
|
||||||
|
el.attribs.src = new URL(el.attribs.src, url).href;
|
||||||
|
} catch (_) {}
|
||||||
|
});
|
||||||
|
soup("a[href]").each((_, el) => {
|
||||||
|
try {
|
||||||
|
el.attribs.href = new URL(el.attribs.href, url).href;
|
||||||
|
} catch (_) {}
|
||||||
|
});
|
||||||
|
|
||||||
const cleanedHtml = soup.html();
|
const cleanedHtml = soup.html();
|
||||||
return cleanedHtml;
|
return cleanedHtml;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { parseMarkdown } from "../../../lib/html-to-markdown";
|
import { parseMarkdown } from "../../../lib/html-to-markdown";
|
||||||
import { Meta } from "..";
|
import { Meta } from "..";
|
||||||
import { Document } from "../../../controllers/v1/types";
|
import { Document } from "../../../controllers/v1/types";
|
||||||
import { removeUnwantedElements } from "../lib/removeUnwantedElements";
|
import { htmlTransform } from "../lib/removeUnwantedElements";
|
||||||
import { extractLinks } from "../lib/extractLinks";
|
import { extractLinks } from "../lib/extractLinks";
|
||||||
import { extractMetadata } from "../lib/extractMetadata";
|
import { extractMetadata } from "../lib/extractMetadata";
|
||||||
import { performLLMExtract } from "./llmExtract";
|
import { performLLMExtract } from "./llmExtract";
|
||||||
@@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
document.html = removeUnwantedElements(document.rawHtml, meta.options);
|
document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user