Nick:
This commit is contained in:
@@ -5,9 +5,28 @@ import dotenv from "dotenv";
|
||||
import { Document, PageOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
export async function generateRequestParams(
|
||||
url: string,
|
||||
wait_browser: string = "domcontentloaded",
|
||||
timeout: number = 15000
|
||||
): Promise<any> {
|
||||
const defaultParams = {
|
||||
url: url,
|
||||
params: { timeout: timeout, wait_browser: wait_browser },
|
||||
headers: { "ScrapingService-Request": "TRUE" },
|
||||
};
|
||||
|
||||
const urlKey = new URL(url).hostname;
|
||||
if (urlSpecificParams.hasOwnProperty(urlKey)) {
|
||||
return { ...defaultParams, ...urlSpecificParams[urlKey] };
|
||||
} else {
|
||||
return defaultParams;
|
||||
}
|
||||
}
|
||||
export async function scrapWithCustomFirecrawl(
|
||||
url: string,
|
||||
options?: any
|
||||
@@ -28,11 +47,13 @@ export async function scrapWithScrapingBee(
|
||||
): Promise<string> {
|
||||
try {
|
||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
||||
const response = await client.get({
|
||||
url: url,
|
||||
params: { timeout: timeout, wait_browser: wait_browser },
|
||||
headers: { "ScrapingService-Request": "TRUE" },
|
||||
});
|
||||
const clientParams = await generateRequestParams(
|
||||
url,
|
||||
wait_browser,
|
||||
timeout
|
||||
);
|
||||
|
||||
const response = await client.get(clientParams);
|
||||
|
||||
if (response.status !== 200 && response.status !== 404) {
|
||||
console.error(
|
||||
@@ -107,11 +128,15 @@ export async function scrapSingleUrl(
|
||||
let text = "";
|
||||
switch (method) {
|
||||
case "firecrawl-scraper":
|
||||
text = await scrapWithCustomFirecrawl(url,);
|
||||
text = await scrapWithCustomFirecrawl(url);
|
||||
break;
|
||||
case "scrapingBee":
|
||||
if (process.env.SCRAPING_BEE_API_KEY) {
|
||||
text = await scrapWithScrapingBee(url,"domcontentloaded", pageOptions.fallback === false? 7000 : 15000);
|
||||
text = await scrapWithScrapingBee(
|
||||
url,
|
||||
"domcontentloaded",
|
||||
pageOptions.fallback === false ? 7000 : 15000
|
||||
);
|
||||
}
|
||||
break;
|
||||
case "playwright":
|
||||
@@ -141,7 +166,7 @@ export async function scrapSingleUrl(
|
||||
break;
|
||||
}
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
|
||||
|
||||
return [await parseMarkdown(cleanedHtml), text];
|
||||
};
|
||||
|
||||
@@ -155,7 +180,7 @@ export async function scrapSingleUrl(
|
||||
|
||||
let [text, html] = await attemptScraping(urlToScrap, "scrapingBee");
|
||||
// Basically means that it is using /search endpoint
|
||||
if(pageOptions.fallback === false){
|
||||
if (pageOptions.fallback === false) {
|
||||
const soup = cheerio.load(html);
|
||||
const metadata = extractMetadata(soup, urlToScrap);
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user