Files
firecrawl/apps/api/src/services/logging/log_job.ts
T

230 lines
7.1 KiB
TypeScript
Raw Normal View History

2024-04-20 13:53:11 -07:00
import { supabase_service } from "../supabase";
import { FirecrawlJob } from "../../types";
2024-05-02 15:30:22 -04:00
import { posthog } from "../posthog";
2024-04-20 13:53:11 -07:00
import "dotenv/config";
2024-11-07 20:57:33 +01:00
import { logger } from "../../lib/logger";
2024-09-04 15:57:57 -03:00
import { configDotenv } from "dotenv";
import { Storage } from "@google-cloud/storage";
2024-09-04 15:57:57 -03:00
configDotenv();
2024-04-20 13:53:11 -07:00
2025-01-07 19:38:17 +01:00
/**
 * Returns a copy of `x` with every NUL character (`\u0000`) removed from the
 * strings it contains.
 *
 * - Arrays are mapped element by element.
 * - `Date` instances are returned as-is. They have no own enumerable
 *   properties, so rebuilding them from their entries would produce `{}`.
 * - Other objects are rebuilt from their own enumerable entries, with both
 *   keys and values cleaned. The result is always a plain object.
 * - Strings have every NUL removed.
 * - Everything else (numbers, booleans, `null`, `undefined`, ...) is returned
 *   unchanged.
 *
 * The input is never mutated.
 *
 * NOTE(review): this is applied to `job.docs` before the database insert,
 * presumably because the backing store rejects NUL inside text/JSON values;
 * confirm against the database documentation.
 *
 * @param x - Any JSON-like value.
 * @returns The cleaned value, typed the same as the input.
 */
function cleanOfNull<T>(x: T): T {
  if (Array.isArray(x)) {
    return x.map((item) => cleanOfNull(item)) as T;
  }

  if (x instanceof Date) {
    return x;
  }

  if (typeof x === "object" && x !== null) {
    return Object.fromEntries(
      Object.entries(x).map(([key, value]) => [
        // Keys are strings too and can carry NUL characters.
        cleanOfNull(key),
        cleanOfNull(value),
      ]),
    ) as T;
  }

  if (typeof x === "string") {
    // split/join removes every occurrence without needing String#replaceAll.
    return x.split("\u0000").join("") as T;
  }

  return x;
}
/**
 * Uploads a job's documents to a Google Cloud Storage bucket and tags the
 * object with a summary of the job as custom metadata.
 *
 * The object is named `<job_id>.json` and contains `JSON.stringify(job.docs)`.
 * Metadata is written in a second call after the upload succeeds.
 *
 * This is best-effort: any error from either call is logged and swallowed, so
 * the returned promise never rejects.
 *
 * @param job - The job whose documents and summary fields are stored.
 * @param bucketName - Name of the destination bucket.
 */
async function saveJobToGCS(job: FirecrawlJob, bucketName: string): Promise<void> {
  try {
    const storage = new Storage();
    const bucket = storage.bucket(bucketName);
    const blob = bucket.file(`${job.job_id}.json`);

    await blob.save(JSON.stringify(job.docs), {
      contentType: "application/json",
    });

    await blob.setMetadata({
      metadata: {
        job_id: job.job_id ?? null,
        success: job.success,
        message: job.message ?? null,
        num_docs: job.num_docs,
        time_taken: job.time_taken,
        // Preview teams are not real teams, so no team id is recorded.
        team_id:
          job.team_id === "preview" || job.team_id?.startsWith("preview_")
            ? null
            : job.team_id,
        mode: job.mode,
        url: job.url,
        // Custom metadata is a flat key/value map, so nested option objects
        // are serialised to JSON strings rather than passed as objects.
        crawler_options:
          job.crawlerOptions == null ? null : JSON.stringify(job.crawlerOptions),
        page_options:
          job.scrapeOptions == null ? null : JSON.stringify(job.scrapeOptions),
        origin: job.origin,
        num_tokens: job.num_tokens ?? null,
        retry: !!job.retry,
        crawl_id: job.crawl_id ?? null,
        tokens_billed: job.tokens_billed ?? null,
      },
    });
  } catch (error) {
    logger.error(`Error saving job to GCS`, {
      error,
      scrapeId: job.job_id,
      jobId: job.job_id,
    });
  }
}
2025-02-25 19:32:16 -03:00
/**
 * Sends a scrape job to the external index server configured by
 * `FIRE_INDEX_SERVER_URL`, via `POST <url>/api/jobs`.
 *
 * Only jobs whose mode is `"single_urls"` or `"scrape"` are sent; any other
 * mode returns immediately without a request.
 *
 * This is best-effort: HTTP failures and thrown errors are logged and
 * swallowed, so the returned promise never rejects.
 *
 * @param job - The job to forward.
 */
async function indexJob(job: FirecrawlJob): Promise<void> {
  try {
    if (job.mode !== "single_urls" && job.mode !== "scrape") {
      return;
    }

    const response = await fetch(`${process.env.FIRE_INDEX_SERVER_URL}/api/jobs`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        url: job.url,
        mode: job.mode || "scrape",
        docs: job.docs,
        origin: job.origin,
        success: job.success,
        time_taken: job.time_taken,
        num_tokens: job.num_tokens,
        page_options: job.scrapeOptions,
        date_added: new Date().toISOString(),
      }),
    });

    if (!response.ok) {
      // Error responses are not guaranteed to be JSON (e.g. a proxy's HTML
      // error page). Read the body as text and parse opportunistically so the
      // status line below is always logged.
      const rawBody = await response.text();
      let errorData: unknown = rawBody;
      try {
        errorData = JSON.parse(rawBody);
      } catch {
        // Not JSON: keep the raw text as the error payload.
      }
      logger.error(`Failed to send job to external server: ${response.status} ${response.statusText}`, {
        error: errorData,
        scrapeId: job.job_id,
      });
    } else {
      logger.debug("Job sent to external server successfully!", { scrapeId: job.job_id });
    }
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`Error sending job to external server: ${message}`, {
      error,
      scrapeId: job.job_id,
    });
  }
}
/**
 * Records a finished job.
 *
 * Does nothing unless `USE_DB_AUTHENTICATION` is `"true"`. Otherwise, in
 * order:
 *  1. builds the `firecrawl_jobs` row (docs are passed through `cleanOfNull`);
 *  2. if `FIRE_INDEX_SERVER_URL` is set, starts `indexJob` without awaiting it;
 *  3. if `GCS_BUCKET_NAME` is set, awaits `saveJobToGCS`;
 *  4. inserts the row into `firecrawl_jobs`. With `force` the insert is
 *     attempted up to 11 times with a 75 ms pause after each failure;
 *     without it, the insert is attempted once;
 *  5. if `POSTHOG_API_KEY` is set, the job is not part of a crawl and its
 *     mode is not `"single_urls"`, captures a `job-logged` PostHog event.
 *
 * Errors are logged and never propagated to the caller.
 *
 * @param job - The job to record.
 * @param force - Retry the database insert on failure instead of giving up
 *   after the first attempt.
 */
export async function logJob(job: FirecrawlJob, force: boolean = false) {
  try {
    if (process.env.USE_DB_AUTHENTICATION !== "true") {
      return;
    }

    // Pages scraped with an Authorization header used to be redacted here.
    // That is intentionally no longer done: results are now retrieved from
    // the db, and redacting them breaks authed crawls. - mogery

    // Preview teams are stored and reported without a team id.
    const isPreviewTeam =
      job.team_id === "preview" || job.team_id?.startsWith("preview_");
    const teamId = isPreviewTeam ? null : job.team_id;

    const jobColumn = {
      job_id: job.job_id || null,
      success: job.success,
      message: job.message,
      num_docs: job.num_docs,
      docs: cleanOfNull(job.docs),
      time_taken: job.time_taken,
      team_id: teamId,
      mode: job.mode,
      url: job.url,
      crawler_options: job.crawlerOptions,
      page_options: job.scrapeOptions,
      origin: job.origin,
      num_tokens: job.num_tokens,
      retry: !!job.retry,
      crawl_id: job.crawl_id,
      tokens_billed: job.tokens_billed,
    };

    // Send job to external server (deliberately not awaited).
    if (process.env.FIRE_INDEX_SERVER_URL) {
      void indexJob(job);
    }

    const gcsBucketName = process.env.GCS_BUCKET_NAME;
    if (gcsBucketName) {
      await saveJobToGCS(job, gcsBucketName);
    }

    const insertRow = () =>
      supabase_service.from("firecrawl_jobs").insert([jobColumn]);
    const pause = () =>
      new Promise<void>((resolve) => setTimeout(() => resolve(), 75));

    if (!force) {
      // Single attempt: report the outcome and move on.
      const { error } = await insertRow();
      if (error) {
        logger.error(`Error logging job: ${error.message}`, {
          error,
          scrapeId: job.job_id,
        });
      } else {
        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
      }
    } else {
      // Forced: keep retrying, pausing after every failed attempt.
      let inserted = false;
      for (let attempt = 0; attempt <= 10; attempt++) {
        try {
          const { error } = await insertRow();
          if (!error) {
            inserted = true;
            break;
          }
          logger.error(
            "Failed to log job due to Supabase error -- trying again",
            { error, scrapeId: job.job_id },
          );
          await pause();
        } catch (error) {
          logger.error(
            "Failed to log job due to thrown error -- trying again",
            { error, scrapeId: job.job_id },
          );
          await pause();
        }
      }

      if (inserted) {
        logger.debug("Job logged successfully!", { scrapeId: job.job_id });
      } else {
        logger.error("Failed to log job!", { scrapeId: job.job_id });
      }
    }

    const shouldCapture =
      process.env.POSTHOG_API_KEY &&
      !job.crawl_id &&
      job.mode !== "single_urls";
    if (shouldCapture) {
      const phLog = {
        // To identify this on the group level, distinctId is set to a static
        // string per the posthog docs:
        // https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
        distinctId: "from-api",
        // Identify the event on this team (skipped for preview teams).
        ...(isPreviewTeam ? {} : { groups: { team: job.team_id } }),
        event: "job-logged",
        properties: {
          success: job.success,
          message: job.message,
          num_docs: job.num_docs,
          time_taken: job.time_taken,
          team_id: teamId,
          mode: job.mode,
          url: job.url,
          crawler_options: job.crawlerOptions,
          page_options: job.scrapeOptions,
          origin: job.origin,
          num_tokens: job.num_tokens,
          retry: job.retry,
          tokens_billed: job.tokens_billed,
        },
      };
      posthog.capture(phLog);
    }
  } catch (error) {
    logger.error(`Error logging job: ${error.message}`);
  }
}