Added an HTML-to-Markdown table parser

This commit is contained in:
rafaelsideguide
2024-04-17 11:01:19 -03:00
parent a12f4d96a2
commit ff622739b7
2 changed files with 69 additions and 1 deletions
@@ -4,6 +4,7 @@ import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import { Document } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { parseTablesToMarkdown } from "./utils/parseTable";
// import puppeteer from "puppeteer";
dotenv.config();
@@ -132,7 +133,8 @@ export async function scrapSingleUrl(
}
break;
}
const cleanedHtml = removeUnwantedElements(text);
let cleanedHtml = removeUnwantedElements(text);
cleanedHtml = await parseTablesToMarkdown(cleanedHtml);
return [await parseMarkdown(cleanedHtml), text];
};