apps/api/src/scraper/scrapeURL/engines/pdf/index.ts

import { createReadStream, promises as fs } from "node:fs";
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import * as marked from "marked";
import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError } from "../../error";

/** Result shape shared by the PDF processors in this file: HTML output and, optionally, markdown output. */
type PDFProcessorResult = { html: string; markdown?: string };

/**
 * Parses a downloaded PDF through the LlamaParse (LlamaIndex cloud) API.
 *
 * The file at `tempFilePath` is sent to the upload endpoint as a FormData
 * body, then the job's markdown result endpoint is polled (250 ms between
 * attempts) until it returns a result, a non-retryable failure occurs, or the
 * time budget runs out. The budget starts after the upload has completed.
 *
 * @param meta - Scrape context; only `meta.logger` is used here.
 * @param tempFilePath - Path of the PDF on local disk.
 * @param timeToRun - Polling budget in milliseconds; 300000 when undefined.
 * @returns The markdown reported by LlamaParse and its HTML rendering via `marked`.
 * @throws RemoveFeatureError when the failure response body contains "PDF_IS_BROKEN".
 * @throws Error "LlamaParse threw an error" for any other failure status except 404.
 * @throws Error "LlamaParse timed out" when no result arrives within the budget.
 */
async function scrapePDFWithLlamaParse(
  meta: Meta,
  tempFilePath: string,
  timeToRun: number | undefined,
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with LlamaIndex", {
    tempFilePath,
  });

  const uploadForm = new FormData();

  // This is utterly stupid but it works! - mogery
  // A hand-rolled object posing as a Blob: only `stream()` and the metadata
  // fields are functional, every other Blob method throws. Presumably this
  // lets the upload read the file from disk as a stream -- TODO confirm.
  uploadForm.append("file", {
    [Symbol.toStringTag]: "Blob",
    name: tempFilePath,
    stream() {
      return createReadStream(
        tempFilePath,
      ) as unknown as ReadableStream<Uint8Array>;
    },
    arrayBuffer() {
      throw Error("Unimplemented in mock Blob: arrayBuffer");
    },
    size: (await fs.stat(tempFilePath)).size,
    text() {
      throw Error("Unimplemented in mock Blob: text");
    },
    slice(start, end, contentType) {
      throw Error("Unimplemented in mock Blob: slice");
    },
    type: "application/pdf",
  } as Blob);

  const upload = await robustFetch({
    url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
    },
    body: uploadForm,
    logger: meta.logger.child({
      method: "scrapePDFWithLlamaParse/upload/robustFetch",
    }),
    schema: z.object({
      id: z.string(),
    }),
  });

  const jobId = upload.id;

  // TODO: retries
  const startedAt = Date.now();
  const timeout = timeToRun ?? 300000;

  while (Date.now() <= startedAt + timeout) {
    try {
      const result = await robustFetch({
        url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
        method: "GET",
        headers: {
          Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
        },
        logger: meta.logger.child({
          method: "scrapePDFWithLlamaParse/result/robustFetch",
        }),
        schema: z.object({
          markdown: z.string(),
        }),
      });
      return {
        markdown: result.markdown,
        html: await marked.parse(result.markdown, { async: true }),
      };
    } catch (e) {
      // Only the failure-status error is interpreted here; anything else is
      // unexpected and propagates unchanged.
      if (!(e instanceof Error) || e.message !== "Request sent failure status") {
        throw e;
      }

      // Read the response defensively: a missing or oddly shaped cause must
      // not raise a TypeError that hides the real failure.
      const failure = (
        e.cause as
          | { response?: { status?: unknown; body?: unknown } }
          | null
          | undefined
      )?.response;

      if (failure?.status === 404) {
        // no-op, result not up yet
      } else if (
        typeof failure?.body === "string" &&
        failure.body.includes("PDF_IS_BROKEN")
      ) {
        // URL is not a PDF, actually!
        meta.logger.debug("URL is not actually a PDF, signalling...");
        throw new RemoveFeatureError(["pdf"]);
      } else {
        throw new Error("LlamaParse threw an error", {
          cause: e.cause,
        });
      }
    }

    // Wait before polling the result endpoint again.
    await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
  }

  throw new Error("LlamaParse timed out");
}

/**
 * Extracts the text of a PDF on disk with pdf-parse.
 *
 * The extracted text is HTML-escaped and returned unchanged in both the
 * `markdown` and `html` fields.
 *
 * @param meta - Scrape context; only `meta.logger` is used here.
 * @param tempFilePath - Path of the PDF on local disk.
 * @returns The escaped text as both markdown and HTML.
 */
async function scrapePDFWithParsePDF(
  meta: Meta,
  tempFilePath: string,
): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const pdfBytes = await fs.readFile(tempFilePath);
  const { text } = await PdfParse(pdfBytes);
  const safeText = escapeHtml(text);

  return { markdown: safeText, html: safeText };
}

/**
 * Scrapes a URL that is expected to point at a PDF.
 *
 * When `meta.options.parsePDF` is falsy the file is fetched and returned
 * base64-encoded in both `html` and `markdown`, without any parsing.
 *
 * Otherwise the file is downloaded to a temporary path and parsed with
 * pdf-parse. If that yields fewer than 500 characters (including none at all)
 * and `LLAMAPARSE_API_KEY` is set, LlamaParse is tried and its result is
 * preferred. LlamaParse failures fall back to the pdf-parse result, except
 * `RemoveFeatureError`, which propagates to the caller.
 *
 * The temporary file is removed on every exit path, including errors.
 *
 * @param meta - Scrape context (`id`, `url`, `options.parsePDF`, `logger` are used).
 * @param timeToRun - Forwarded to LlamaParse as its polling budget in milliseconds.
 * @returns The final URL, HTTP status and the extracted HTML/markdown.
 * @throws RemoveFeatureError when LlamaParse signals the document is not a valid PDF.
 */
export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
    const file = await fetchFileToBuffer(meta.url);
    const content = file.buffer.toString("base64");
    return {
      url: file.response.url,
      statusCode: file.response.status,

      html: content,
      markdown: content,
    };
  }

  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  try {
    // First, try parsing with PdfParse
    let result: PDFProcessorResult = await scrapePDFWithParsePDF(
      {
        ...meta,
        logger: meta.logger.child({
          method: "scrapePDF/scrapePDFWithParsePDF",
        }),
      },
      tempFilePath,
    );

    // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse.
    // Empty or missing text counts as length 0, so PDFs with no extractable
    // text are also handed to LlamaParse.
    const parsedLength = result.markdown?.length ?? 0;
    if (parsedLength < 500 && process.env.LLAMAPARSE_API_KEY) {
      try {
        const llamaResult = await scrapePDFWithLlamaParse(
          {
            ...meta,
            logger: meta.logger.child({
              method: "scrapePDF/scrapePDFWithLlamaParse",
            }),
          },
          tempFilePath,
          timeToRun,
        );
        result = llamaResult; // Use LlamaParse result if successful
      } catch (error) {
        if (error instanceof Error && error.message === "LlamaParse timed out") {
          meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
            error,
          });
        } else if (error instanceof RemoveFeatureError) {
          throw error;
        } else {
          meta.logger.warn(
            "LlamaParse failed to parse PDF -- using parse-pdf result",
            { error },
          );
          Sentry.captureException(error);
        }
      }
    }

    return {
      url: response.url,
      statusCode: response.status,

      html: result.html,
      markdown: result.markdown,
    };
  } finally {
    // Always remove the downloaded file, even when parsing throws. A cleanup
    // failure is only logged so it can neither mask the original error nor
    // fail an otherwise successful scrape.
    await fs.unlink(tempFilePath).catch((error: unknown) => {
      meta.logger.warn("Failed to delete temporary PDF file", {
        tempFilePath,
        error,
      });
    });
  }
}
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import { createReadStream, promises as fs } from "node:fs";`
			`import { Meta } from "../..";`
			`import { EngineScrapeResult } from "..";`
			`import * as marked from "marked";`
			`import { robustFetch } from "../../lib/fetch";`
			`import { z } from "zod";`
			`import * as Sentry from "@sentry/node";`
			`import escapeHtml from "escape-html";`
revert to pdf parse 2024-12-17 12:12:22 -05:00			`import PdfParse from "pdf-parse";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";`
fix(scrapeURL/pdf): handle if a presumed PDF link returns HTML (e.g. 404) 2024-12-10 23:24:33 +01:00			`import { RemoveFeatureError } from "../../error";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`type PDFProcessorResult = { html: string; markdown?: string };`

			`async function scrapePDFWithLlamaParse(`
			`meta: Meta,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`tempFilePath: string,`
fix(scrapeURL/engines): better timeouts 2024-12-15 18:58:29 +01:00			`timeToRun: number \| undefined,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`): Promise<PDFProcessorResult> {`
			`meta.logger.debug("Processing PDF document with LlamaIndex", {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`tempFilePath,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`

			`const uploadForm = new FormData();`

			`// This is utterly stupid but it works! - mogery`
			`uploadForm.append("file", {`
			`[Symbol.toStringTag]: "Blob",`
			`name: tempFilePath,`
			`stream() {`
			`return createReadStream(`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`tempFilePath,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`) as unknown as ReadableStream<Uint8Array>;`
			`},`
			`arrayBuffer() {`
			`throw Error("Unimplemented in mock Blob: arrayBuffer");`
			`},`
			`size: (await fs.stat(tempFilePath)).size,`
			`text() {`
			`throw Error("Unimplemented in mock Blob: text");`
			`},`
			`slice(start, end, contentType) {`
			`throw Error("Unimplemented in mock Blob: slice");`
			`},`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`type: "application/pdf",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`} as Blob);`

			`const upload = await robustFetch({`
			`url: "https://api.cloud.llamaindex.ai/api/parsing/upload",`
			`method: "POST",`
			`headers: {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
			`body: uploadForm,`
			`logger: meta.logger.child({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`method: "scrapePDFWithLlamaParse/upload/robustFetch",`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}),`
			`schema: z.object({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`id: z.string(),`
			`}),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`

			`const jobId = upload.id;`

			`// TODO: timeout, retries`
			`const startedAt = Date.now();`
fix(scrapeURL/engines): better timeouts 2024-12-15 18:58:29 +01:00			`const timeout = timeToRun ?? 300000;`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00
fix(scrapeURL/engines): better timeouts 2024-12-15 18:58:29 +01:00			`while (Date.now() <= startedAt + timeout) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`try {`
			`const result = await robustFetch({`
			url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
			`method: "GET",`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`headers: {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`},`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger: meta.logger.child({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`method: "scrapePDFWithLlamaParse/result/robustFetch",`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`schema: z.object({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`markdown: z.string(),`
			`}),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
			`return {`
			`markdown: result.markdown,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`html: await marked.parse(result.markdown, { async: true }),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`} catch (e) {`
			`if (e instanceof Error && e.message === "Request sent failure status") {`
			`if ((e.cause as any).response.status === 404) {`
			`// no-op, result not up yet`
			`} else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {`
			`// URL is not a PDF, actually!`
			`meta.logger.debug("URL is not actually a PDF, signalling...");`
			`throw new RemoveFeatureError(["pdf"]);`
			`} else {`
			`throw new Error("LlamaParse threw an error", {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`cause: e.cause,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
fix(scrapeURL/pdf): handle if a presumed PDF link returns HTML (e.g. 404) 2024-12-10 23:24:33 +01:00			`}`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`} else {`
			`throw e;`
			`}`
fix(scrapeURL/pdf): handle if a presumed PDF link returns HTML (e.g. 404) 2024-12-10 23:24:33 +01:00			`}`

Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));`
			`}`

			`throw new Error("LlamaParse timed out");`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`async function scrapePDFWithParsePDF(`
			`meta: Meta,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`tempFilePath: string,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`): Promise<PDFProcessorResult> {`
revert to pdf parse 2024-12-17 12:12:22 -05:00			`meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
revert to pdf parse 2024-12-17 12:12:22 -05:00			`const result = await PdfParse(await fs.readFile(tempFilePath));`
			`const escaped = escapeHtml(result.text);`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`return {`
			`markdown: escaped,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`html: escaped,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`

fix(scrapeURL/engines): better timeouts 2024-12-15 18:58:29 +01:00			`export async function scrapePDF(meta: Meta, timeToRun: number \| undefined): Promise<EngineScrapeResult> {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (!meta.options.parsePDF) {`
			`const file = await fetchFileToBuffer(meta.url);`
			`const content = file.buffer.toString("base64");`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`return {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`url: file.response.url,`
			`statusCode: file.response.status,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`html: content,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`markdown: content,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
			`}`

			`const { response, tempFilePath } = await downloadFile(meta.id, meta.url);`

			`let result: PDFProcessorResult \| null = null;`
Update index.ts 2024-12-17 09:50:29 -05:00
revert to pdf parse 2024-12-17 12:12:22 -05:00			`// First, try parsing with PdfParse`
Update index.ts 2024-12-17 09:50:29 -05:00			`result = await scrapePDFWithParsePDF(`
			`{`
			`...meta,`
			`logger: meta.logger.child({`
			`method: "scrapePDF/scrapePDFWithParsePDF",`
			`}),`
			`},`
			`tempFilePath,`
			`);`


			`// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse`
			`if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`try {`
Update index.ts 2024-12-17 09:50:29 -05:00			`const llamaResult = await scrapePDFWithLlamaParse(`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`{`
			`...meta,`
			`logger: meta.logger.child({`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`method: "scrapePDF/scrapePDFWithLlamaParse",`
			`}),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`},`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`tempFilePath,`
fix(scrapeURL/engines): better timeouts 2024-12-15 18:58:29 +01:00			`timeToRun,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
Update index.ts 2024-12-17 09:50:29 -05:00			`result = llamaResult; // Use LlamaParse result if successful`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`} catch (error) {`
			`if (error instanceof Error && error.message === "LlamaParse timed out") {`
revert to pdf parse 2024-12-17 12:12:22 -05:00			`meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`error,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
			`} else if (error instanceof RemoveFeatureError) {`
			`throw error;`
			`} else {`
			`meta.logger.warn(`
revert to pdf parse 2024-12-17 12:12:22 -05:00			`"LlamaParse failed to parse PDF -- using parse-pdf result",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`{ error },`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
			`Sentry.captureException(error);`
			`}`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`}`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`}`

			`await fs.unlink(tempFilePath);`

			`return {`
			`url: response.url,`
			`statusCode: response.status,`

			`html: result.html,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`markdown: result.markdown,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`};`
revert to pdf parse 2024-12-17 12:12:22 -05:00			`}`