2024-11-07 20:57:33 +01:00
|
|
|
import { ScrapingBeeClient } from "scrapingbee";
|
|
|
|
|
import { Meta } from "../..";
|
|
|
|
|
import { EngineScrapeResult } from "..";
|
|
|
|
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
|
|
|
|
import { AxiosError, type AxiosResponse } from "axios";
|
|
|
|
|
import { EngineError } from "../../error";
|
|
|
|
|
|
|
|
|
|
// Module-level ScrapingBee client, constructed once when this module is loaded
// and reused by every request issued from scrapeURLWithScrapingBee.
// The `!` only silences the compiler: SCRAPING_BEE_API_KEY is not validated here,
// so an unset variable is handed to the constructor as `undefined`.
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);
|
|
|
|
|
|
2024-12-11 19:46:11 -03:00
|
|
|
/**
 * Builds a ScrapingBee-backed scrape engine for a given browser wait strategy.
 *
 * The returned function requests `meta.url` through the shared module-level
 * client, decodes the JSON payload and maps it to an `EngineScrapeResult`.
 *
 * Failure modes of the returned function:
 * - throws `EngineError("Engine error #34")` when the payload has `errors`, a
 *   `body.error`, or none of the Date / Content-Type headers (in either casing);
 * - throws `EngineError("Engine error #35")` when the payload's `body` is not a string;
 * - rethrows whatever the client threw unless it is an `AxiosError` carrying a
 *   response, in which case that response is processed like a normal one;
 * - JSON decoding failures are not wrapped and propagate as thrown by `JSON.parse`.
 *
 * @param wait_browser - Forwarded unchanged as the `wait_browser` request parameter.
 * @returns An async scraper taking the scrape `Meta` and resolving to the engine result.
 */
export function scrapeURLWithScrapingBee(
  wait_browser: "domcontentloaded" | "networkidle2"
): (meta: Meta) => Promise<EngineScrapeResult> {
  // Header names whose presence marks the payload as a genuine page response.
  const markerHeaderNames = ["Date", "date", "Content-Type", "content-type"];

  return async (meta: Meta): Promise<EngineScrapeResult> => {
    let response: AxiosResponse<any>;

    try {
      const params = {
        timeout: 15000, // TODO: dynamic timeout based on request timeout
        wait_browser,
        // The requested wait is capped at 35000.
        wait: Math.min(meta.options.waitFor, 35000),
        transparent_status_code: true,
        json_response: true,
        screenshot: meta.options.formats.includes("screenshot"),
        screenshot_full_page: meta.options.formats.includes(
          "screenshot@fullPage"
        )
      };

      response = await client.get({
        url: meta.url,
        params,
        headers: {
          // Per the original author (mogery): this header is sent to the page,
          // not to ScrapingBee.
          "ScrapingService-Request": "TRUE"
        }
      });
    } catch (error) {
      // An Axios failure that still carries a response is handled like any
      // other response; everything else is propagated untouched.
      const attachedResponse =
        error instanceof AxiosError ? error.response : undefined;
      if (attachedResponse === undefined) {
        throw error;
      }
      response = attachedResponse;
    }

    // The response data is treated as raw bytes holding a JSON document.
    const rawPayload: Buffer = response.data;
    const payload = JSON.parse(new TextDecoder().decode(rawPayload));

    const failureCause = { body: payload, statusCode: response.status };

    // First non-nullish marker header value, if any; a missing or falsy value
    // means the payload is treated as an engine error in disguise.
    const payloadHeaders = payload.headers ?? {};
    const firstMarkerValue = markerHeaderNames
      .map((name) => payloadHeaders[name])
      .find((value) => value != null);
    const isHiddenEngineError = !firstMarkerValue;

    if (payload.errors || payload.body?.error || isHiddenEngineError) {
      // Log the most specific piece of error information available.
      const loggedDetail =
        payload.body?.error ?? payload.errors ?? payload.body ?? payload;
      meta.logger.error("ScrapingBee threw an error", { body: loggedDetail });
      throw new EngineError("Engine error #34", { cause: failureCause });
    }

    if (typeof payload.body !== "string") {
      meta.logger.error("ScrapingBee: Body is not string??", { body: payload });
      throw new EngineError("Engine error #35", { cause: failureCause });
    }

    const specialtyLogger = meta.logger.child({
      method: "scrapeURLWithScrapingBee/specialtyScrapeCheck"
    });
    specialtyScrapeCheck(specialtyLogger, payload.headers);

    // The screenshot key is only present when the payload contains one.
    const screenshotField = payload.screenshot
      ? { screenshot: `data:image/png;base64,${payload.screenshot}` }
      : {};
    const isErrorStatus = response.status >= 300;

    return {
      url: payload["resolved-url"] ?? meta.url,
      html: payload.body,
      error: isErrorStatus ? response.statusText : undefined,
      statusCode: response.status,
      ...screenshotField
    };
  };
}
|