8d467c8ca7
* feat: use strictNullChecking * feat: switch logger to Winston * feat(scrapeURL): first batch * fix(scrapeURL): error swallow * fix(scrapeURL): add timeout to EngineResultsTracker * fix(scrapeURL): report unexpected error to sentry * chore: remove unused modules * feat(transfomers/coerce): warn when a format's response is missing * feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support * (add note) * feat(scrapeURL): wip readme * feat(scrapeURL): LLM extract * feat(scrapeURL): better warnings * fix(scrapeURL/engines/fire-engine;playwright): fix screenshot * feat(scrapeURL): add forceEngine internal option * feat(scrapeURL/engines): scrapingbee * feat(scrapeURL/transformars): uploadScreenshot * feat(scrapeURL): more intense tests * bunch of stuff * get rid of WebScraper (mostly) * adapt batch scrape * add staging deploy workflow * fix yaml * fix logger issues * fix v1 test schema * feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions * scrapeURL: v0 backwards compat * logger fixes * feat(scrapeurl): v0 returnOnlyUrls support * fix(scrapeURL/v0): URL leniency * fix(batch-scrape): ts non-nullable * fix(scrapeURL/fire-engine/chromecdp): fix wait action * fix(logger): remove error debug key * feat(requests.http): use dotenv expression * fix(scrapeURL/extractMetadata): extract custom metadata * fix crawl option conversion * feat(scrapeURL): Add retry logic to robustFetch * fix(scrapeURL): crawl stuff * fix(scrapeURL): LLM extract * fix(scrapeURL/v0): search fix * fix(tests/v0): grant larger response size to v0 crawl status * feat(scrapeURL): basic fetch engine * feat(scrapeURL): playwright engine * feat(scrapeURL): add url-specific parameters * Update readme and examples * added e2e tests for most parameters. Still a few actions, location and iframes to be done. * fixed type * Nick: * Update scrape.ts * Update index.ts * added actions and base64 check * Nick: skipTls feature flag? 
* 403 * todo * todo * fixes * yeet headers from url specific params * add warning when final engine has feature deficit * expose engine results tracker for ScrapeEvents implementation * ingest scrape events * fixed some tests * comment * Update index.test.ts * fixed rawHtml * Update index.test.ts * update comments * move geolocation to global f-e option, fix removeBase64Images * Nick: * trim url-specific params * Update index.ts --------- Co-authored-by: Eric Ciarla <ericciarla@yahoo.com> Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Co-authored-by: Nicolas <nicolascamara29@gmail.com>
115 lines
3.6 KiB
TypeScript
115 lines
3.6 KiB
TypeScript
import { createReadStream, promises as fs } from "node:fs";
|
|
import FormData from "form-data";
|
|
import { Meta } from "../..";
|
|
import { EngineScrapeResult } from "..";
|
|
import * as marked from "marked";
|
|
import { robustFetch } from "../../lib/fetch";
|
|
import { z } from "zod";
|
|
import * as Sentry from "@sentry/node";
|
|
import escapeHtml from "escape-html";
|
|
import PdfParse from "pdf-parse";
|
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
|
|
|
/**
 * Output shared by the PDF processors in this file: rendered HTML plus an
 * optional markdown representation of the same document.
 */
type PDFProcessorResult = {
  html: string;
  markdown?: string;
};
|
|
|
|
/**
 * Converts a PDF stored on disk to markdown/HTML via the LlamaIndex cloud
 * parsing API.
 *
 * The file is uploaded as multipart form data, the returned job id is used to
 * request the job's markdown result, and that markdown is rendered to HTML
 * with `marked`.
 *
 * @param meta - Scrape context; only `meta.logger` is used here.
 * @param tempFilePath - Path of the PDF file to upload.
 * @returns The markdown returned by the API and its HTML rendering.
 */
async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });

  // Built per request so the API key is read from the environment at call time.
  const authHeaders = () => ({
    "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
  });

  const form = new FormData();
  form.append("file", createReadStream(tempFilePath), {
    filename: tempFilePath,
    contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
  });

  // Step 1: upload the document; the response carries the parsing job id.
  const { id: jobId } = await robustFetch({
    url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
    method: "POST",
    headers: authHeaders(),
    body: form,
    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
    schema: z.object({
      id: z.string(),
    }),
  });

  // Step 2: request the markdown result for that job.
  // TODO: timeout, retries
  // NOTE(review): the result is requested once, right after the upload; whether
  // robustFetch waits or retries until the job is finished is not visible here.
  const { markdown } = await robustFetch({
    url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
    method: "GET",
    headers: authHeaders(),
    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
    schema: z.object({
      markdown: z.string(),
    }),
  });

  const html = await marked.parse(markdown, { async: true });
  return { markdown, html };
}
|
|
|
|
/**
 * Local PDF processor: reads the file from disk, extracts its text with
 * `pdf-parse`, and HTML-escapes that text.
 *
 * @param meta - Scrape context; only `meta.logger` is used here.
 * @param tempFilePath - Path of the PDF file to read.
 * @returns The escaped text, used for both the `markdown` and `html` fields.
 */
async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const pdfBytes = await fs.readFile(tempFilePath);
  const { text } = await PdfParse(pdfBytes);

  // The same escaped string is returned for both output formats.
  const safeText = escapeHtml(text);
  return { markdown: safeText, html: safeText };
}
|
|
|
|
/**
 * Scrape engine entry point for PDF documents.
 *
 * - When `meta.options.parsePDF` is falsy, the file is fetched into memory and
 *   returned base64-encoded as both `html` and `markdown`.
 * - Otherwise the file is downloaded to a temporary path and parsed. LlamaParse
 *   is attempted only when `LLAMAPARSE_API_KEY` is set; if it throws, the error
 *   is logged and reported to Sentry and the local `pdf-parse` processor is used
 *   instead.
 *
 * The temporary file is removed whether parsing succeeds or fails.
 *
 * @param meta - Scrape context (`id`, `url`, `options`, `logger` are read).
 * @returns The final URL and status code of the download plus the parsed content.
 * @throws Whatever the download, the fallback processor, or the temp-file
 *         removal throws.
 */
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
    // Parsing disabled: hand back the raw document as base64.
    const file = await fetchFileToBuffer(meta.url);
    const content = file.buffer.toString("base64");
    return {
      url: file.response.url,
      statusCode: file.response.status,

      html: content,
      markdown: content,
    };
  }

  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  try {
    let result: PDFProcessorResult | null = null;

    if (process.env.LLAMAPARSE_API_KEY) {
      try {
        result = await scrapePDFWithLlamaParse({
          ...meta,
          logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
        }, tempFilePath);
      } catch (error) {
        // Deliberate best-effort: a LlamaParse failure must not fail the scrape.
        meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
        Sentry.captureException(error);
      }
    }

    if (result === null) {
      result = await scrapePDFWithParsePDF({
        ...meta,
        logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
      }, tempFilePath);
    }

    return {
      url: response.url,
      statusCode: response.status,

      html: result.html,
      markdown: result.markdown,
    };
  } finally {
    // Runs before the return above completes and also when the fallback parser
    // throws, so the downloaded temp file is never left behind.
    await fs.unlink(tempFilePath);
  }
}
|