This commit is contained in:
Eric Ciarla
2024-06-28 16:39:09 -04:00
parent dbfae2d9bf
commit 70fcf2ce03
7 changed files with 5864 additions and 4651 deletions
@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
import { ScrapingBeeClient } from "scrapingbee";
import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
@@ -303,10 +303,14 @@ export async function scrapSingleUrl(
pageOptions: PageOptions = {
onlyMainContent: true,
includeHtml: false,
rawHtml: false,
waitFor: 0,
screenshot: false,
headers: undefined
},
extractorOptions: ExtractorOptions = {
mode: "llm-extraction-from-markdown"
},
existingHtml: string = ""
): Promise<Document> {
urlToScrap = urlToScrap.trim();
@@ -465,6 +469,7 @@ export async function scrapSingleUrl(
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
metadata: {
...metadata,
screenshot: screenshot,
@@ -478,6 +483,7 @@ export async function scrapSingleUrl(
content: text,
markdown: text,
html: pageOptions.includeHtml ? html : undefined,
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
metadata: {
...metadata,
sourceURL: urlToScrap,