2024-07-03 20:18:11 -03:00
import { ExtractorOptions } from "./../../lib/entities" ;
2024-04-20 13:53:11 -07:00
import { supabase_service } from "../supabase" ;
import { FirecrawlJob } from "../../types" ;
2024-05-02 15:30:22 -04:00
import { posthog } from "../posthog" ;
2024-04-20 13:53:11 -07:00
import "dotenv/config" ;
2024-11-07 20:57:33 +01:00
import { logger } from "../../lib/logger" ;
2024-09-04 15:57:57 -03:00
import { configDotenv } from "dotenv" ;
// Load variables from the local .env file into process.env before anything
// below reads them (logJob reads USE_DB_AUTHENTICATION and POSTHOG_API_KEY).
// NOTE(review): the side-effect import of "dotenv/config" above presumably
// performs the same load already — confirm whether this call is still needed.
configDotenv ( ) ;
2024-04-20 13:53:11 -07:00
2024-11-15 19:55:23 +01:00
/**
 * Returns a version of `job` that is safe to persist.
 *
 * When the scrape options carry a non-empty Authorization header (matched
 * case-insensitively, because HTTP header names are case-insensitive), the
 * header value is replaced with "REDACTED" and the scraped documents are
 * replaced with a single placeholder document. The input object is never
 * mutated; when nothing needs redacting the same object is returned.
 */
function redactAuthorizedJob(job: FirecrawlJob): FirecrawlJob {
  const headers = job.scrapeOptions && job.scrapeOptions.headers;
  if (!headers) {
    return job;
  }

  const authHeaderNames = Object.keys(headers).filter(
    (name) => name.toLowerCase() === "authorization" && headers[name],
  );
  if (authHeaderNames.length === 0) {
    return job;
  }

  const redactedHeaders = { ...headers };
  for (const name of authHeaderNames) {
    redactedHeaders[name] = "REDACTED";
  }

  return {
    ...job,
    scrapeOptions: { ...job.scrapeOptions, headers: redactedHeaders },
    docs: [
      {
        content: "REDACTED DUE TO AUTHORIZATION HEADER",
        html: "REDACTED DUE TO AUTHORIZATION HEADER",
      },
    ],
  };
}

/**
 * Records a finished job in the `firecrawl_jobs` table and, when PostHog is
 * configured, emits a `job-logged` analytics event.
 *
 * This is best-effort: every failure is logged and swallowed, so the returned
 * promise never rejects. Nothing is written unless the environment variable
 * USE_DB_AUTHENTICATION is exactly "true".
 *
 * @param job   The job to record. It is not modified; credentials and scraped
 *              content are redacted on a copy before anything is stored.
 * @param force When true, a failed insert is retried (up to 11 attempts in
 *              total, 75 ms apart) instead of being reported once.
 */
export async function logJob(
  job: FirecrawlJob,
  force: boolean = false,
): Promise<void> {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
    if (!useDbAuthentication) {
      return;
    }

    // Redact any pages that have an authorization header. Work on a copy so
    // that logging never alters the object the caller handed us.
    const safeJob = redactAuthorizedJob(job);
    const scrapeId = safeJob.job_id;
    const pause = (ms: number) =>
      new Promise<void>((resolve) => setTimeout(resolve, ms));

    const jobColumn = {
      job_id: safeJob.job_id ? safeJob.job_id : null,
      success: safeJob.success,
      message: safeJob.message,
      num_docs: safeJob.num_docs,
      docs: safeJob.docs,
      time_taken: safeJob.time_taken,
      // The "preview" pseudo-team is stored as null rather than as a team id.
      team_id: safeJob.team_id === "preview" ? null : safeJob.team_id,
      mode: safeJob.mode,
      url: safeJob.url,
      crawler_options: safeJob.crawlerOptions,
      page_options: safeJob.scrapeOptions,
      origin: safeJob.origin,
      num_tokens: safeJob.num_tokens,
      retry: !!safeJob.retry,
      crawl_id: safeJob.crawl_id,
    };

    if (force) {
      // Forced logging: keep retrying the insert, pausing briefly after each
      // failure, until it succeeds or the attempt budget (11) is exhausted.
      let done = false;
      for (let attempt = 0; attempt <= 10 && !done; attempt++) {
        try {
          const { error } = await supabase_service
            .from("firecrawl_jobs")
            .insert([jobColumn]);
          if (error) {
            logger.error(
              "Failed to log job due to Supabase error -- trying again",
              { error, scrapeId },
            );
            await pause(75);
          } else {
            done = true;
          }
        } catch (error) {
          logger.error(
            "Failed to log job due to thrown error -- trying again",
            { error, scrapeId },
          );
          await pause(75);
        }
      }

      if (done) {
        logger.debug("Job logged successfully!", { scrapeId });
      } else {
        logger.error("Failed to log job!", { scrapeId });
      }
    } else {
      const { error } = await supabase_service
        .from("firecrawl_jobs")
        .insert([jobColumn]);
      if (error) {
        logger.error(`Error logging job: ${error.message}`, {
          error,
          scrapeId,
        });
      } else {
        logger.debug("Job logged successfully!", { scrapeId });
      }
    }

    // Analytics: only for jobs that are not part of a crawl and whose mode is
    // not "single_urls", and only when a PostHog key is configured.
    if (
      process.env.POSTHOG_API_KEY &&
      !safeJob.crawl_id &&
      safeJob.mode !== "single_urls"
    ) {
      const phLog = {
        distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
        ...(safeJob.team_id !== "preview" && {
          groups: { team: safeJob.team_id },
        }), //* Identifying event on this team
        event: "job-logged",
        properties: {
          success: safeJob.success,
          message: safeJob.message,
          num_docs: safeJob.num_docs,
          time_taken: safeJob.time_taken,
          team_id: safeJob.team_id === "preview" ? null : safeJob.team_id,
          mode: safeJob.mode,
          url: safeJob.url,
          crawler_options: safeJob.crawlerOptions,
          page_options: safeJob.scrapeOptions,
          origin: safeJob.origin,
          num_tokens: safeJob.num_tokens,
          retry: safeJob.retry,
        },
      };
      posthog.capture(phLog);
    }
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`; a failure
    // here must not turn this best-effort logger into a rejected promise.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`Error logging job: ${message}`);
  }
}