2024-05-06 11:36:44 -03:00
import { ExtractorOptions , PageOptions } from './../lib/entities' ;
2024-04-20 16:38:05 -07:00
import { Request , Response } from "express" ;
2024-04-20 19:04:27 -07:00
import { billTeam , checkTeamCredits } from "../services/billing/credit_billing" ;
2024-04-20 16:38:05 -07:00
import { authenticateUser } from "./auth" ;
2024-04-20 19:04:27 -07:00
import { RateLimiterMode } from "../types" ;
import { logJob } from "../services/logging/log_job" ;
import { Document } from "../lib/entities" ;
2024-04-23 18:50:35 -03:00
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist" ; // Import the isUrlBlocked function
2024-04-30 09:20:15 -07:00
import { numTokensFromString } from '../lib/LLM-extraction/helpers' ;
2024-06-26 09:00:54 -03:00
import { defaultPageOptions , defaultExtractorOptions , defaultTimeout , defaultOrigin } from '../lib/default-values' ;
2024-08-06 16:26:46 +02:00
import { addScrapeJob } from '../services/queue-jobs' ;
import { scrapeQueueEvents } from '../services/queue-service' ;
2024-07-24 14:31:25 +02:00
import { v4 as uuidv4 } from "uuid" ;
2024-07-25 09:48:06 -03:00
import { Logger } from '../lib/logger' ;
2024-08-22 03:55:40 +02:00
import * as Sentry from "@sentry/node" ;
2024-04-20 16:38:05 -07:00
/**
 * Queues a single-URL scrape job, waits for it to finish and shapes the
 * outcome into an HTTP-style result object.
 *
 * @param jobId - Identifier handed to the queue as the job id.
 * @param req - Express request; `body.url` and `body.origin` are read from it.
 * @param team_id - Team the job is queued for.
 * @param crawlerOptions - Forwarded unchanged in the job payload.
 * @param pageOptions - Forwarded in the job payload; `includeRawHtml` also
 *   decides whether `rawHtml` is kept in the returned document.
 * @param extractorOptions - Forwarded in the job payload; `mode` is inspected
 *   when deciding whether to strip `rawHtml`.
 * @param timeout - Wait limit passed to `job.waitUntilFinished`
 *   (presumably milliseconds, as in BullMQ — confirm against the queue service).
 * @param plan - Accepted for interface compatibility; not used here.
 * @returns `{ success, returnCode }` plus `data` on success or `error` on
 *   failure: 400 for a missing or non-string URL, 403 for a blocked URL,
 *   408 when waiting for the job times out, 200 otherwise.
 * @throws Re-throws any queue error that is not a wait timeout.
 */
export async function scrapeHelper(
  jobId: string,
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
  plan?: string
): Promise<{
  success: boolean;
  error?: string;
  data?: Document;
  returnCode: number;
}> {
  const url = req.body.url;
  if (!url) {
    return { success: false, error: "Url is required", returnCode: 400 };
  }
  // A truthy non-string (number, object, array) is a client error; reject it
  // here instead of handing it to the blocklist check and the queue.
  if (typeof url !== "string") {
    return { success: false, error: "Url must be a string", returnCode: 400 };
  }

  if (isUrlBlocked(url)) {
    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  }

  const job = await addScrapeJob(
    {
      url,
      mode: "single_urls",
      crawlerOptions,
      team_id,
      pageOptions,
      extractorOptions,
      origin: req.body.origin ?? defaultOrigin,
    },
    {},
    jobId
  );

  let doc;
  try {
    // The job resolves with a list of documents; only the first one is used.
    // Optional chaining keeps an empty job result on the "No page found" path
    // below instead of raising a TypeError before the job is removed.
    const docs = await job.waitUntilFinished(scrapeQueueEvents, timeout);
    doc = docs?.[0];
  } catch (e) {
    // The wait-timeout error is recognised by its message prefix.
    if (e instanceof Error && e.message.startsWith("Job wait")) {
      return {
        success: false,
        error: "Request timed out",
        returnCode: 408,
      };
    }
    throw e;
  }

  await job.remove();

  if (!doc) {
    console.error("!!! PANIC DOC IS", doc, job);
    return { success: true, error: "No page found", returnCode: 200, data: doc };
  }

  // Drop fields that are not part of the response.
  delete doc.index;
  delete doc.provider;

  // Remove rawHtml if pageOptions.includeRawHtml is false and
  // extractorOptions.mode is llm-extraction-from-raw-html.
  if (!pageOptions.includeRawHtml && extractorOptions.mode === "llm-extraction-from-raw-html") {
    delete doc.rawHtml;
  }

  return {
    success: true,
    data: doc,
    returnCode: 200,
  };
}
/** Timeout applied to LLM-extraction scrapes when the request does not set one. */
const LLM_EXTRACTION_DEFAULT_TIMEOUT = 90000;

/**
 * Derives a client-facing message from whatever value was thrown.
 * Thrown values are not guaranteed to be `Error` instances, so a string is
 * returned as-is and anything without a string `message` gets a generic text.
 */
function describeScrapeError(thrown: unknown): string {
  if (typeof thrown === "string") {
    return thrown;
  }
  const message = (thrown as { message?: unknown } | null | undefined)?.message;
  return typeof message === "string" ? message : "Internal Server Error";
}

/**
 * Express handler for a single scrape request.
 *
 * Steps, in order: authenticate the caller, merge request options over the
 * defaults, check that the team has credits, run `scrapeHelper`, bill the
 * team when the scrape succeeded, log the job and send the helper's result
 * with its `returnCode` as the HTTP status.
 *
 * @param req - Express request; `body.crawlerOptions`, `body.pageOptions`,
 *   `body.extractorOptions`, `body.origin`, `body.timeout` and `body.url`
 *   are read.
 * @param res - Express response used to send the JSON reply.
 * @returns The value of the `res.status(...).json(...)` call that ended the
 *   request.
 */
export async function scrapeController(req: Request, res: Response) {
  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
    const origin = req.body.origin ?? defaultOrigin;

    // Evaluated once; drives the main-content flag, the timeout default and
    // the credits billed after a successful scrape.
    const isLlmExtraction = extractorOptions.mode.includes("llm-extraction");
    if (isLlmExtraction) {
      pageOptions.onlyMainContent = true;
    }
    const timeout =
      req.body.timeout ?? (isLlmExtraction ? LLM_EXTRACTION_DEFAULT_TIMEOUT : defaultTimeout);

    // checkCredits
    try {
      const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
      if (!creditsCheckSuccess) {
        return res.status(402).json({ error: "Insufficient credits" });
      }
    } catch (error) {
      Logger.error(error);
      return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
    }

    const jobId = uuidv4();
    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      jobId,
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      timeout,
      plan
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
    const numTokens =
      result.data && result.data.markdown
        ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
        : 0;

    if (result.success) {
      // billing for doc done on queue end; only the LLM-extraction surcharge
      // is added here.
      const creditsPerLLMExtract = 50;
      const creditsToBeBilled = isLlmExtraction ? creditsPerLLMExtract : 0;

      const billingResult = await billTeam(team_id, creditsToBeBilled);
      if (!billingResult.success) {
        return res.status(402).json({
          success: false,
          error: "Failed to bill team. Insufficient credits or subscription not found.",
        });
      }
    }

    logJob({
      job_id: jobId,
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
    // The thrown value may be a string, null or a non-Error object, so the
    // message is derived defensively rather than read from `error.message`.
    return res.status(500).json({ error: describeScrapeError(error) });
  }
}