Merge branch 'main' into llm-extraction

This commit is contained in:
Nicolas
2024-04-29 08:40:44 -07:00
4 changed files with 99 additions and 14 deletions
+39 -9
View File
@@ -5,9 +5,33 @@ import dotenv from "dotenv";
import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
dotenv.config();
export async function generateRequestParams(
url: string,
wait_browser: string = "domcontentloaded",
timeout: number = 15000
): Promise<any> {
const defaultParams = {
url: url,
params: { timeout: timeout, wait_browser: wait_browser },
headers: { "ScrapingService-Request": "TRUE" },
};
try {
const urlKey = new URL(url).hostname;
if (urlSpecificParams.hasOwnProperty(urlKey)) {
return { ...defaultParams, ...urlSpecificParams[urlKey] };
} else {
return defaultParams;
}
} catch (error) {
console.error(`Error generating URL key: ${error}`);
return defaultParams;
}
}
export async function scrapWithCustomFirecrawl(
url: string,
options?: any
@@ -28,11 +52,13 @@ export async function scrapWithScrapingBee(
): Promise<string> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const response = await client.get({
url: url,
params: { timeout: timeout, wait_browser: wait_browser },
headers: { "ScrapingService-Request": "TRUE" },
});
const clientParams = await generateRequestParams(
url,
wait_browser,
timeout
);
const response = await client.get(clientParams);
if (response.status !== 200 && response.status !== 404) {
console.error(
@@ -107,11 +133,15 @@ export async function scrapSingleUrl(
let text = "";
switch (method) {
case "firecrawl-scraper":
text = await scrapWithCustomFirecrawl(url,);
text = await scrapWithCustomFirecrawl(url);
break;
case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) {
text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
text = await scrapWithScrapingBee(
url,
"domcontentloaded",
pageOptions.fallback === false ? 7000 : 15000
);
}
break;
case "playwright":
@@ -143,7 +173,7 @@ export async function scrapSingleUrl(
//* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text];
};
@@ -157,7 +187,7 @@ export async function scrapSingleUrl(
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
// Basically means that it is using /search endpoint
if(pageOptions.fallback === false){
if (pageOptions.fallback === false) {
const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap);
return {