This commit is contained in:
Nicolas
2024-04-28 11:34:25 -07:00
parent fb08f28edf
commit 8e44696c4d
2 changed files with 58 additions and 9 deletions
+34 -9
View File
@@ -5,9 +5,28 @@ import dotenv from "dotenv";
import { Document, PageOptions } from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
dotenv.config();
/**
 * Builds the request options object handed to the scraping client for `url`.
 *
 * The defaults carry the target `url`, a `params` object with `timeout` and
 * `wait_browser`, and a `headers` object with `"ScrapingService-Request"`.
 * When `urlSpecificParams` has an entry keyed by the URL's hostname, that
 * entry is layered over the defaults.
 *
 * `params` and `headers` are merged one level deep, so a host entry that only
 * overrides e.g. `wait_browser` does not silently discard the caller's
 * `timeout` or the default header. Any other top-level key in the host entry
 * replaces the default outright.
 *
 * @param url - Address of the page to scrape; also used to look up
 *   host-specific overrides.
 * @param wait_browser - Browser wait condition placed in `params.wait_browser`.
 * @param timeout - Value placed in `params.timeout`.
 * @returns The request options; the defaults alone when `url` cannot be
 *   parsed or its hostname has no override entry.
 */
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url: url,
    params: { timeout: timeout, wait_browser: wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  // `new URL` throws a TypeError on input it cannot parse. A malformed URL
  // cannot match any host entry, so fall back to the defaults instead of
  // failing before the request is even built.
  let urlKey: string;
  try {
    urlKey = new URL(url).hostname;
  } catch {
    return defaultParams;
  }

  // Go through Object.prototype so the lookup keeps working even if the
  // table is prototype-less or has its own "hasOwnProperty" key.
  if (!Object.prototype.hasOwnProperty.call(urlSpecificParams, urlKey)) {
    return defaultParams;
  }

  const overrides = urlSpecificParams[urlKey];
  return {
    ...defaultParams,
    ...overrides,
    // Merge nested objects key-by-key; host values win on conflicts.
    params: { ...defaultParams.params, ...overrides?.params },
    headers: { ...defaultParams.headers, ...overrides?.headers },
  };
}
export async function scrapWithCustomFirecrawl(
url: string,
options?: any
@@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
): Promise<string> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
const response = await client.get({
url: url,
params: { timeout: timeout, wait_browser: wait_browser },
headers: { "ScrapingService-Request": "TRUE" },
});
const clientParams = await generateRequestParams(
url,
wait_browser,
timeout
);
const response = await client.get(clientParams);
if (response.status !== 200 && response.status !== 404) {
console.error(
@@ -107,11 +128,15 @@ export async function scrapSingleUrl(
let text = "";
switch (method) {
case "firecrawl-scraper":
text = await scrapWithCustomFirecrawl(url,);
text = await scrapWithCustomFirecrawl(url);
break;
case "scrapingBee":
if (process.env.SCRAPING_BEE_API_KEY) {
text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
text = await scrapWithScrapingBee(
url,
"domcontentloaded",
pageOptions.fallback === false ? 7000 : 15000
);
}
break;
case "playwright":
@@ -141,7 +166,7 @@ export async function scrapSingleUrl(
break;
}
let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text];
};
@@ -155,7 +180,7 @@ export async function scrapSingleUrl(
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
// Basically means that it is using /search endpoint
if(pageOptions.fallback === false){
if (pageOptions.fallback === false) {
const soup = cheerio.load(html);
const metadata = extractMetadata(soup, urlToScrap);
return {