apps/api/src/services/logging/log_job.ts

import { supabase_service } from "../supabase";
import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog";
import "dotenv/config";
import { logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
import { Storage } from "@google-cloud/storage";
configDotenv();

/**
 * Recursively removes NUL characters (`\u0000`) from every string found in `x`.
 *
 * - Strings have every NUL character removed.
 * - Arrays are mapped element by element.
 * - Any other non-null object is rebuilt from its own enumerable string-keyed
 *   entries; both the property names and the values are cleaned, so a NUL
 *   hidden in a key does not survive. Note that this applies to every object,
 *   so class instances come back as plain objects built from their entries.
 * - All remaining values (numbers, booleans, `null`, `undefined`, ...) are
 *   returned unchanged.
 *
 * @param x - Value to sanitize. It is not mutated.
 * @returns A sanitized copy of `x` (or `x` itself for non-container,
 *          non-string values).
 */
function cleanOfNull<T>(x: T): T {
  if (typeof x === "string") {
    // split/join removes every occurrence without needing String#replaceAll.
    return x.split("\u0000").join("") as T;
  }
  if (Array.isArray(x)) {
    return x.map((item) => cleanOfNull(item)) as T;
  }
  if (typeof x === "object" && x !== null) {
    return Object.fromEntries(
      Object.entries(x).map(([key, value]) => [
        cleanOfNull(key),
        cleanOfNull(value),
      ]),
    ) as T;
  }
  return x;
}


/**
 * Uploads a job's documents to Google Cloud Storage and attaches the job's
 * summary fields to the object as custom metadata.
 *
 * The object is named `<job_id>.json` and contains `JSON.stringify(job.docs)`.
 * Failures are logged and swallowed: this function never rejects.
 *
 * @param job - The job whose docs and summary fields are stored.
 * @param bucketName - Name of the destination bucket.
 */
async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise<void> {
  // Custom object metadata is a flat key/value map, so structured values
  // (the option objects) are JSON-encoded instead of being sent as nested
  // objects. Missing values are sent as null.
  const toMetadataValue = (value: unknown): string | null =>
    value == null ? null : JSON.stringify(value);

  try {
    const storage = new Storage();
    const bucket = storage.bucket(bucketName);
    const blob = bucket.file(`${job.job_id}.json`);

    await blob.save(JSON.stringify(job.docs), {
      contentType: "application/json",
    });

    // Preview teams are stored without a team id.
    const isPreviewTeam =
      job.team_id === "preview" || job.team_id?.startsWith("preview_");

    await blob.setMetadata({
      metadata: {
        job_id: job.job_id ?? null,
        success: job.success,
        message: job.message ?? null,
        num_docs: job.num_docs,
        time_taken: job.time_taken,
        team_id: isPreviewTeam ? null : job.team_id,
        mode: job.mode,
        url: job.url,
        crawler_options: toMetadataValue(job.crawlerOptions),
        page_options: toMetadataValue(job.scrapeOptions),
        origin: job.origin,
        num_tokens: job.num_tokens ?? null,
        retry: !!job.retry,
        crawl_id: job.crawl_id ?? null,
        tokens_billed: job.tokens_billed ?? null,
      },
    });
  } catch (error) {
    // Best-effort: a storage failure must not fail the caller.
    logger.error(`Error saving job to GCS`, {
      error,
      scrapeId: job.job_id,
      jobId: job.job_id,
    });
  }
}

/**
 * Sends a finished job to the external index server at
 * `${FIRE_INDEX_SERVER_URL}/api/jobs` via HTTP POST.
 *
 * Only jobs whose mode is `"single_urls"` or `"scrape"` are sent; every other
 * mode returns immediately. Failures (network errors and non-2xx replies) are
 * logged and swallowed: this function never rejects, so it is safe to call
 * without awaiting.
 *
 * @param job - The job to forward.
 */
async function indexJob(job: FirecrawlJob): Promise<void> {
  try {
    if (job.mode !== "single_urls" && job.mode !== "scrape") {
      return;
    }

    const response = await fetch(`${process.env.FIRE_INDEX_SERVER_URL}/api/jobs`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        url: job.url,
        // The guard above guarantees a non-empty mode here.
        mode: job.mode,
        docs: job.docs,
        origin: job.origin,
        success: job.success,
        time_taken: job.time_taken,
        num_tokens: job.num_tokens,
        page_options: job.scrapeOptions,
        date_added: new Date().toISOString(),
      }),
    });

    if (response.ok) {
      logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });
      return;
    }

    // Error bodies are not guaranteed to be JSON (e.g. an HTML error page from
    // a proxy). Read the raw text and parse it best-effort so the status line
    // below is always logged.
    const rawBody = await response.text();
    let errorData: unknown = rawBody;
    try {
      errorData = JSON.parse(rawBody);
    } catch {
      // Not JSON: keep the raw text as the error payload.
    }

    logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
      error: errorData,
      scrapeId: job.job_id,
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`Error sending job to external server: ${message}`, {
      error,
      scrapeId: job.job_id,
    });
  }
}

/** Insert attempts made when `force` is set: one initial try plus ten retries. */
const LOG_JOB_FORCE_ATTEMPTS = 11;

/** Pause between failed insert attempts, in milliseconds. */
const LOG_JOB_RETRY_DELAY_MS = 75;

/** Returns true when the team id is `"preview"` or starts with `"preview_"`. */
function isPreviewTeam(teamId: string | null | undefined): boolean {
  return teamId === "preview" || !!teamId?.startsWith("preview_");
}

/** Resolves after `ms` milliseconds. */
function pauseBeforeRetry(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/**
 * Records a finished job.
 *
 * Does nothing unless `USE_DB_AUTHENTICATION` is `"true"`. Otherwise, in order:
 *  1. if `FIRE_INDEX_SERVER_URL` is set, forwards the job to the index server
 *     without waiting for the result;
 *  2. if `GCS_BUCKET_NAME` is set, uploads the job to that bucket and waits;
 *  3. inserts a row into the `firecrawl_jobs` table;
 *  4. if `POSTHOG_API_KEY` is set, the job has no `crawl_id` and its mode is
 *     not `"single_urls"`, captures a `job-logged` PostHog event.
 *
 * Every failure is logged and swallowed: this function never rejects.
 *
 * @param job - The job to record.
 * @param force - When true, the table insert is retried (up to
 *   {@link LOG_JOB_FORCE_ATTEMPTS} attempts in total) on both returned and
 *   thrown errors. When false, a single insert is attempted.
 */
export async function logJob(job: FirecrawlJob, force: boolean = false): Promise<void> {
  try {
    if (process.env.USE_DB_AUTHENTICATION !== "true") {
      return;
    }

    // Docs of requests carrying an Authorization header are deliberately NOT
    // redacted here: results are retrieved from the DB, and redacting them
    // broke authed crawls.

    // Preview teams are recorded without a team id.
    const previewTeam = isPreviewTeam(job.team_id);
    const loggedTeamId = previewTeam ? null : job.team_id;

    const jobColumn = {
      job_id: job.job_id ? job.job_id : null,
      success: job.success,
      message: job.message,
      num_docs: job.num_docs,
      docs: cleanOfNull(job.docs),
      time_taken: job.time_taken,
      team_id: loggedTeamId,
      mode: job.mode,
      url: job.url,
      crawler_options: job.crawlerOptions,
      page_options: job.scrapeOptions,
      origin: job.origin,
      num_tokens: job.num_tokens,
      retry: !!job.retry,
      crawl_id: job.crawl_id,
      tokens_billed: job.tokens_billed,
    };

    // Send job to external server. Intentionally not awaited; indexJob
    // handles and logs its own errors.
    if (process.env.FIRE_INDEX_SERVER_URL) {
      void indexJob(job);
    }

    if (process.env.GCS_BUCKET_NAME) {
      await saveJobToGCS(job, process.env.GCS_BUCKET_NAME);
    }

    if (force) {
      let done = false;
      for (let attempt = 1; attempt <= LOG_JOB_FORCE_ATTEMPTS; attempt++) {
        try {
          const { error } = await supabase_service
            .from("firecrawl_jobs")
            .insert([jobColumn]);
          if (!error) {
            done = true;
            break;
          }
          logger.error(
            "Failed to log job due to Supabase error -- trying again",
            { error, scrapeId: job.job_id },
          );
        } catch (error) {
          logger.error(
            "Failed to log job due to thrown error -- trying again",
            { error, scrapeId: job.job_id },
          );
        }

        // Only wait when another attempt will actually follow.
        if (attempt < LOG_JOB_FORCE_ATTEMPTS) {
          await pauseBeforeRetry(LOG_JOB_RETRY_DELAY_MS);
        }
      }

      if (done) {
        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
      } else {
        logger.error("Failed to log job!", { scrapeId: job.job_id });
      }
    } else {
      const { error } = await supabase_service
        .from("firecrawl_jobs")
        .insert([jobColumn]);
      if (error) {
        logger.error(`Error logging job: ${error.message}`, {
          error,
          scrapeId: job.job_id,
        });
      } else {
        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
      }
    }

    if (
      process.env.POSTHOG_API_KEY &&
      !job.crawl_id &&
      job.mode !== "single_urls"
    ) {
      const phLog = {
        //* To identify this on the group level, distinctId is a static string per the PostHog docs:
        //* https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
        distinctId: "from-api",
        //* Identifying event on this team (omitted for preview teams)
        ...(!previewTeam && {
          groups: { team: job.team_id },
        }),
        event: "job-logged",
        properties: {
          success: job.success,
          message: job.message,
          num_docs: job.num_docs,
          time_taken: job.time_taken,
          team_id: loggedTeamId,
          mode: job.mode,
          url: job.url,
          crawler_options: job.crawlerOptions,
          page_options: job.scrapeOptions,
          origin: job.origin,
          num_tokens: job.num_tokens,
          retry: job.retry,
          tokens_billed: job.tokens_billed,
        },
      };
      posthog.capture(phLog);
    }
  } catch (error) {
    // Keep the job id and the error object in the log entry so the failure
    // can be traced back to a specific job.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`Error logging job: ${message}`, {
      error,
      scrapeId: job.job_id,
    });
  }
}
Nick: 2024-04-20 13:53:11 -07:00			`import { supabase_service } from "../supabase";`
			`import { FirecrawlJob } from "../../types";`
Add Posthog Logging 2024-05-02 15:30:22 -04:00			`import { posthog } from "../posthog";`
Nick: 2024-04-20 13:53:11 -07:00			`import "dotenv/config";`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`import { logger } from "../../lib/logger";`
fix: enforced dotenv config 2024-09-04 15:57:57 -03:00			`import { configDotenv } from "dotenv";`
feat(log_job): start saving jobs to GCS (#1424 ) 2025-04-08 19:28:21 +02:00			`import { Storage } from "@google-cloud/storage";`
fix: enforced dotenv config 2024-09-04 15:57:57 -03:00			`configDotenv();`
Nick: 2024-04-20 13:53:11 -07:00
crawl incomplete issues 2025-01-07 19:38:17 +01:00			`function cleanOfNull<T>(x: T): T {`
			`if (Array.isArray(x)) {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`return x.map((x) => cleanOfNull(x)) as T;`
crawl incomplete issues 2025-01-07 19:38:17 +01:00			`} else if (typeof x === "object" && x !== null) {`
Nick: formatting fixes 2025-01-10 18:35:10 -03:00			`return Object.fromEntries(`
			`Object.entries(x).map(([k, v]) => [k, cleanOfNull(v)]),`
			`) as T;`
crawl incomplete issues 2025-01-07 19:38:17 +01:00			`} else if (typeof x === "string") {`
			`return x.replaceAll("\u0000", "") as T;`
			`} else {`
			`return x;`
			`}`
			`}`

feat(log_job): start saving jobs to GCS (#1424 ) 2025-04-08 19:28:21 +02:00
			`async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise<void> {`
			`try {`
			`const storage = new Storage();`
			`const bucket = storage.bucket(bucketName);`
			const blob = bucket.file(`${job.job_id}.json`);
			`await blob.save(JSON.stringify(job.docs), {`
			`contentType: "application/json",`
			`});`
			`await blob.setMetadata({`
			`metadata: {`
			`job_id: job.job_id ?? null,`
			`success: job.success,`
			`message: job.message ?? null,`
			`num_docs: job.num_docs,`
			`time_taken: job.time_taken,`
			`team_id: (job.team_id === "preview" \|\| job.team_id?.startsWith("preview_"))? null : job.team_id,`
			`mode: job.mode,`
			`url: job.url,`
			`crawler_options: job.crawlerOptions,`
			`page_options: job.scrapeOptions,`
			`origin: job.origin,`
			`num_tokens: job.num_tokens ?? null,`
			`retry: !!job.retry,`
			`crawl_id: job.crawl_id ?? null,`
			`tokens_billed: job.tokens_billed ?? null,`
			`},`
			`})`
			`} catch (error) {`
			logger.error(`Error saving job to GCS`, {
			`error,`
			`scrapeId: job.job_id,`
			`jobId: job.job_id,`
			`});`
			`}`
			`}`

Update log_job.ts 2025-02-25 19:32:16 -03:00			`async function indexJob(job: FirecrawlJob): Promise<void> {`
			`try {`
Update log_job.ts 2025-02-25 21:01:00 -03:00			`if (job.mode !== "single_urls" && job.mode !== "scrape") {`
			`return;`
			`}`

Update log_job.ts 2025-02-25 19:32:16 -03:00			const response = await fetch(`${process.env.FIRE_INDEX_SERVER_URL}/api/jobs`, {
			`method: 'POST',`
			`headers: {`
			`'Content-Type': 'application/json',`
			`},`
			`body: JSON.stringify({`
			`url: job.url,`
			`mode: job.mode \|\| "scrape",`
			`docs: job.docs,`
			`origin: job.origin,`
			`success: job.success,`
			`time_taken: job.time_taken,`
			`num_tokens: job.num_tokens,`
			`page_options: job.scrapeOptions,`
			`date_added: new Date().toISOString(),`
			`}),`
			`});`

			`if (!response.ok) {`
			`const errorData = await response.json();`
			logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
			`error: errorData,`
			`scrapeId: job.job_id,`
			`});`
			`} else {`
			`logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });`
			`}`
			`} catch (error) {`
			logger.error(`Error sending job to external server: ${error.message}`, {
			`error,`
			`scrapeId: job.job_id,`
			`});`
			`}`
			`}`

fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`export async function logJob(job: FirecrawlJob, force: boolean = false) {`
Nick: 2024-04-20 13:53:11 -07:00			`try {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";`
added validation on every USE_DB_AUTHENTICATION call 2024-08-12 14:20:41 -03:00			`if (!useDbAuthentication) {`
looking good 2024-06-27 16:00:45 -03:00			`return;`
			`}`
Added implementation for saving docs on supabase 2024-06-26 18:23:28 -03:00
Nick: 2024-07-03 20:18:11 -03:00			`// Redact any pages that have an authorization header`
fix(log_job): don't redact with auth header 2025-01-09 09:50:54 +01:00			`// actually, Don't. we use the db to retrieve results now. this breaks authed crawls - mogery`
			`// if (`
			`// job.scrapeOptions &&`
			`// job.scrapeOptions.headers &&`
			`// job.scrapeOptions.headers["Authorization"]`
			`// ) {`
			`// job.scrapeOptions.headers["Authorization"] = "REDACTED";`
			`// job.docs = [`
			`// {`
			`// content: "REDACTED DUE TO AUTHORIZATION HEADER",`
			`// html: "REDACTED DUE TO AUTHORIZATION HEADER",`
			`// },`
			`// ];`
			`// }`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`const jobColumn = {`
			`job_id: job.job_id ? job.job_id : null,`
			`success: job.success,`
			`message: job.message,`
			`num_docs: job.num_docs,`
crawl incomplete issues 2025-01-07 19:38:17 +01:00			`docs: cleanOfNull(job.docs),`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`time_taken: job.time_taken,`
fix: adapt preview team checks 2025-01-25 19:02:32 +01:00			`team_id: (job.team_id === "preview" \|\| job.team_id?.startsWith("preview_"))? null : job.team_id,`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`mode: job.mode,`
			`url: job.url,`
			`crawler_options: job.crawlerOptions,`
			`page_options: job.scrapeOptions,`
			`origin: job.origin,`
			`num_tokens: job.num_tokens,`
			`retry: !!job.retry,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`crawl_id: job.crawl_id,`
Reapply "Merge pull request #1068 from mendableai/nsc/llm-usage-extract" 2025-01-19 22:04:12 -03:00			`tokens_billed: job.tokens_billed,`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`};`
Nick: 2024-07-03 20:18:11 -03:00
Update log_job.ts 2025-02-25 19:32:16 -03:00			`// Send job to external server`
			`if (process.env.FIRE_INDEX_SERVER_URL) {`
			`indexJob(job);`
			`}`

feat(log_job): start saving jobs to GCS (#1424 ) 2025-04-08 19:28:21 +02:00			`if (process.env.GCS_BUCKET_NAME) {`
			`await saveJobToGCS(job, process.env.GCS_BUCKET_NAME);`
			`}`

fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`if (force) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`let i = 0,`
			`done = false;`
fix(log_job): infinite loop 2024-11-28 08:49:03 +01:00			`while (i++ <= 10) {`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`try {`
			`const { error } = await supabase_service`
			`.from("firecrawl_jobs")`
			`.insert([jobColumn]);`
			`if (error) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger.error(`
			`"Failed to log job due to Supabase error -- trying again",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`{ error, scrapeId: job.job_id },`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
			`await new Promise<void>((resolve) =>`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`setTimeout(() => resolve(), 75),`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`} else {`
fix(log_job): infinite loop 2024-11-28 08:49:03 +01:00			`done = true;`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`break;`
			`}`
			`} catch (error) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`logger.error(`
			`"Failed to log job due to thrown error -- trying again",`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`{ error, scrapeId: job.job_id },`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`);`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));`
			`}`
			`}`
fix(log_job): infinite loop 2024-11-28 08:49:03 +01:00			`if (done) {`
			`logger.debug("Job logged successfully!", { scrapeId: job.job_id });`
			`} else {`
			`logger.error("Failed to log job!", { scrapeId: job.job_id });`
			`}`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`} else {`
			`const { error } = await supabase_service`
			`.from("firecrawl_jobs")`
			`.insert([jobColumn]);`
			`if (error) {`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			logger.error(`Error logging job: ${error.message}`, {
			`error,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`scrapeId: job.job_id,`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`});`
fix(log_job): add force option to retry on supabase failure 2024-11-15 19:55:23 +01:00			`} else {`
			`logger.debug("Job logged successfully!", { scrapeId: job.job_id });`
			`}`
			`}`
Add Posthog Logging 2024-05-02 15:30:22 -04:00
fix: fix posthog, add dummy crawl DB items 2024-08-15 18:55:18 +02:00			`if (process.env.POSTHOG_API_KEY && !job.crawl_id) {`
Nick: 2024-07-03 20:18:11 -03:00			`let phLog = {`
			`distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user`
fix: adapt preview team checks 2025-01-25 19:02:32 +01:00			`...((job.team_id !== "preview" && !job.team_id?.startsWith("preview_")) && {`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`groups: { team: job.team_id },`
Nick: 2024-07-03 20:18:11 -03:00			`}), //* Identifying event on this team`
			`event: "job-logged",`
			`properties: {`
			`success: job.success,`
			`message: job.message,`
			`num_docs: job.num_docs,`
			`time_taken: job.time_taken,`
fix: adapt preview team checks 2025-01-25 19:02:32 +01:00			`team_id: (job.team_id === "preview" \|\| job.team_id?.startsWith("preview_"))? null : job.team_id,`
Nick: 2024-07-03 20:18:11 -03:00			`mode: job.mode,`
			`url: job.url,`
			`crawler_options: job.crawlerOptions,`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			`page_options: job.scrapeOptions,`
Nick: 2024-07-03 20:18:11 -03:00			`origin: job.origin,`
			`num_tokens: job.num_tokens,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`retry: job.retry,`
Reapply "Merge pull request #1068 from mendableai/nsc/llm-usage-extract" 2025-01-19 22:04:12 -03:00			`tokens_billed: job.tokens_billed,`
Nick: revert trailing comma 2024-12-11 19:51:08 -03:00			`},`
Nick: 2024-07-03 20:18:11 -03:00			`};`
Nick: fixed prettier 2024-12-11 19:46:11 -03:00			`if (job.mode !== "single_urls") {`
Update log_job.ts 2024-10-27 23:14:25 -03:00			`posthog.capture(phLog);`
			`}`
Nick: 2024-07-03 20:18:11 -03:00			`}`
Nick: 2024-04-20 13:53:11 -07:00			`} catch (error) {`
`WebScraper` refactor into `scrapeURL` (#714 ) 2024-11-07 20:57:33 +01:00			logger.error(`Error logging job: ${error.message}`);
Nick: 2024-04-20 13:53:11 -07:00			`}`
			`}`