import { createReadStream, promises as fs } from "node:fs"; import FormData from "form-data"; import { Meta } from "../.."; import { EngineScrapeResult } from ".."; import * as marked from "marked"; import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; type PDFProcessorResult = {html: string, markdown?: string}; async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise { meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath }); const uploadForm = new FormData(); uploadForm.append("file", createReadStream(tempFilePath), { filename: tempFilePath, contentType: "application/pdf", // NOTE: request.headers["Content-Type"]? }); const upload = await robustFetch({ url: "https://api.cloud.llamaindex.ai/api/parsing/upload", method: "POST", headers: { "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`, }, body: uploadForm, logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }), schema: z.object({ id: z.string(), }), }); const jobId = upload.id; // TODO: timeout, retries const result = await robustFetch({ url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, method: "GET", headers: { "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`, }, logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }), schema: z.object({ markdown: z.string(), }), }); return { markdown: result.markdown, html: await marked.parse(result.markdown, { async: true }), }; } async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise { meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); const result = await PdfParse(await fs.readFile(tempFilePath)); const escaped = escapeHtml(result.text); return { markdown: escaped, html: escaped, }; } export async function scrapePDF(meta: Meta): Promise { if (!meta.options.parsePDF) { const file = await fetchFileToBuffer(meta.url); const content = file.buffer.toString("base64"); return { url: file.response.url, statusCode: file.response.status, html: content, markdown: content, }; } const { response, tempFilePath } = await downloadFile(meta.id, meta.url); let result: PDFProcessorResult | null = null; if (process.env.LLAMAPARSE_API_KEY) { try { result = await scrapePDFWithLlamaParse({ ...meta, logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }), }, tempFilePath); } catch (error) { meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error }); Sentry.captureException(error); } } if (result === null) { result = await scrapePDFWithParsePDF({ ...meta, logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }), }, tempFilePath); } await fs.unlink(tempFilePath); return { url: response.url, statusCode: response.status, html: result.html, markdown: result.markdown, } }