init
This commit is contained in:
@@ -2,7 +2,7 @@ import * as cheerio from "cheerio";
|
||||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document, PageOptions, FireEngineResponse } from "../../lib/entities";
|
||||
import { Document, PageOptions, FireEngineResponse, ExtractorOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||
@@ -303,10 +303,14 @@ export async function scrapSingleUrl(
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
rawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
headers: undefined
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown"
|
||||
},
|
||||
existingHtml: string = ""
|
||||
): Promise<Document> {
|
||||
urlToScrap = urlToScrap.trim();
|
||||
@@ -465,6 +469,7 @@ export async function scrapSingleUrl(
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
screenshot: screenshot,
|
||||
@@ -478,6 +483,7 @@ export async function scrapSingleUrl(
|
||||
content: text,
|
||||
markdown: text,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml: pageOptions.rawHtml || extractorOptions.mode === "llm-extraction-from-raw-html" ? rawHtml : undefined,
|
||||
metadata: {
|
||||
...metadata,
|
||||
sourceURL: urlToScrap,
|
||||
|
||||
Reference in New Issue
Block a user