2024-05-06 11:36:44 -03:00
import { ExtractorOptions , PageOptions } from './../lib/entities' ;
2024-04-20 16:38:05 -07:00
import { Request , Response } from "express" ;
2024-04-20 19:04:27 -07:00
import { WebScraperDataProvider } from "../scraper/WebScraper" ;
import { billTeam , checkTeamCredits } from "../services/billing/credit_billing" ;
2024-04-20 16:38:05 -07:00
import { authenticateUser } from "./auth" ;
2024-04-20 19:04:27 -07:00
import { RateLimiterMode } from "../types" ;
import { logJob } from "../services/logging/log_job" ;
import { Document } from "../lib/entities" ;
2024-04-23 18:50:35 -03:00
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist" ; // Import the isUrlBlocked function
2024-04-30 09:20:15 -07:00
import { numTokensFromString } from '../lib/LLM-extraction/helpers' ;
2024-06-26 09:00:54 -03:00
import { defaultPageOptions , defaultExtractorOptions , defaultTimeout , defaultOrigin } from '../lib/default-values' ;
2024-07-25 00:14:25 +02:00
import { addWebScraperJob } from '../services/queue-jobs' ;
import { getWebScraperQueue } from '../services/queue-service' ;
2024-04-20 16:38:05 -07:00
/**
 * Enqueues a single-URL scrape job and waits until the queue reports it as
 * completed, or until `timeout` milliseconds have elapsed.
 *
 * @param req - Incoming request; `req.body.url` is the page to scrape and
 *   `req.body.origin` (optional) is forwarded to the job.
 * @param team_id - Team the job is enqueued for.
 * @param crawlerOptions - Forwarded unchanged to the job.
 * @param pageOptions - Forwarded to the job; `includeRawHtml` also decides
 *   whether `rawHtml` is stripped from the returned document.
 * @param extractorOptions - Forwarded to the job; see `pageOptions` for how
 *   `mode` interacts with raw HTML.
 * @param timeout - Maximum time, in milliseconds, to wait for completion.
 * @param plan - Accepted for interface compatibility; not used here.
 * @returns An object whose `returnCode` is the HTTP status the caller should
 *   send: 400 when no URL was given, 403 when the URL is blocked, 408 on
 *   timeout, 200 otherwise (with `data` set when a document was produced).
 */
export async function scrapeHelper(
  req: Request,
  team_id: string,
  crawlerOptions: any,
  pageOptions: PageOptions,
  extractorOptions: ExtractorOptions,
  timeout: number,
  plan?: string
): Promise<{
  success: boolean;
  error?: string;
  data?: Document;
  returnCode: number;
}> {
  const url = req.body.url;
  if (!url) {
    return { success: false, error: "Url is required", returnCode: 400 };
  }

  if (isUrlBlocked(url)) {
    return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  }

  const job = await addWebScraperJob({
    url,
    mode: "single_urls",
    crawlerOptions,
    team_id,
    pageOptions,
    extractorOptions,
    origin: req.body.origin ?? defaultOrigin,
  });

  const wsq = getWebScraperQueue();

  // The Promise executor runs synchronously, so `resolveCompleted` is
  // assigned before the listener can ever fire.
  let resolveCompleted!: (jobId: string) => void;
  const docsPromise = new Promise<string>((resolve) => {
    resolveCompleted = resolve;
  });

  // "global:completed" fires for every job on the queue, so filter on our id.
  // NOTE(review): the listener is attached after the job was enqueued; a job
  // that completes before this point would only surface as a timeout.
  const listener = (j: string) => {
    console.log("JOB COMPLETED", j, "vs", job.id);
    if (j === job.id) {
      resolveCompleted(j);
    }
  };
  wsq.on("global:completed", listener);

  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(
      () => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }),
      timeout
    );
  });

  let completedJobId: string;
  try {
    completedJobId = await Promise.race([docsPromise, timeoutPromise]);
  } catch (error) {
    // The only rejection source is the timeout, whose payload already has the
    // shape of this function's result.
    return error;
  } finally {
    // Always release the timer and the queue listener, whichever side won,
    // so neither outlives the request.
    clearTimeout(timer);
    wsq.removeListener("global:completed", listener);
  }

  const completedJob = await wsq.getJob(completedJobId);
  const doc = completedJob?.progress()?.currentDocument;

  // Guard before touching `doc`: the job may be gone or may have produced
  // no document at all.
  if (!doc) {
    return { success: true, error: "No page found", returnCode: 200, data: doc };
  }

  delete doc.index;

  // Raw HTML was only fetched to feed the extractor; drop it unless the
  // caller explicitly asked for it.
  if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
    delete doc.rawHtml;
  }

  return {
    success: true,
    data: doc,
    returnCode: 200,
  };
}
/**
 * Express handler for scraping a single URL.
 *
 * Flow: authenticate the caller, merge request options over the defaults,
 * verify the team has credits, run the scrape through `scrapeHelper`, bill
 * the team on success, log the job, and send the scrape result using the
 * status code chosen by `scrapeHelper`.
 *
 * The credit check runs concurrently with the scrape, except in LLM
 * extraction modes where it must pass before any work starts. Whenever the
 * credit check has already answered the request with a 402, no further
 * scraping result is billed, logged, or written to `res`.
 *
 * @param req - Request whose body carries `url` and the optional
 *   `crawlerOptions`, `pageOptions`, `extractorOptions`, `origin`, `timeout`.
 * @param res - Response used for every outcome (success and errors).
 */
export async function scrapeController(req: Request, res: Response) {
  // True once the credit check has already sent a 402; after that nothing
  // else may write to `res`. Declared outside `try` so `catch` can see it.
  let earlyReturn = false;

  try {
    // make sure to authenticate user first, Bearer <token>
    const { success, team_id, error, status, plan } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Scrape
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
    const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
    const origin = req.body.origin ?? defaultOrigin;
    let timeout = req.body.timeout ?? defaultTimeout;

    const isLlmExtraction = extractorOptions.mode.includes("llm-extraction");
    if (isLlmExtraction) {
      pageOptions.onlyMainContent = true;
      // LLM extraction gets a longer default unless the caller set a timeout.
      timeout = req.body.timeout ?? 90000;
    }

    // Answers the request with a 402 (and flags `earlyReturn`) when the team
    // cannot pay for one document or the check itself fails.
    const checkCredits = async () => {
      try {
        const { success: creditsCheckSuccess } = await checkTeamCredits(team_id, 1);
        if (!creditsCheckSuccess) {
          earlyReturn = true;
          return res.status(402).json({ error: "Insufficient credits" });
        }
      } catch (error) {
        console.error(error);
        earlyReturn = true;
        return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
      }
    };

    // Async check saves 500ms in average case
    // Don't async check in llm extraction mode as it could be expensive
    const creditsCheck = checkCredits();
    if (isLlmExtraction) {
      await creditsCheck;
      if (earlyReturn) {
        // The 402 is already on the wire; do not start the expensive scrape.
        return;
      }
    }

    const startTime = new Date().getTime();
    const result = await scrapeHelper(
      req,
      team_id,
      crawlerOptions,
      pageOptions,
      extractorOptions,
      timeout,
      plan
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;

    // Join the concurrent credit check before billing or responding, so it
    // can neither race the final response nor be ignored on a failed scrape.
    await creditsCheck;
    if (earlyReturn) {
      // Don't bill, log, or respond again if we already returned a 402.
      return;
    }

    const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;

    if (result.success) {
      const creditsPerLLMExtract = 50;
      // 1 credit per document, plus a flat surcharge for LLM extraction.
      const creditsToBeBilled = isLlmExtraction ? 1 + creditsPerLLMExtract : 1;

      const billingResult = await billTeam(team_id, creditsToBeBilled);
      if (!billingResult.success) {
        return res.status(402).json({
          success: false,
          error: "Failed to bill team. Insufficient credits or subscription not found.",
        });
      }
    }

    logJob({
      success: result.success,
      message: result.error,
      num_docs: 1,
      docs: [result.data],
      time_taken: timeTakenInSeconds,
      team_id: team_id,
      mode: "scrape",
      url: req.body.url,
      crawlerOptions: crawlerOptions,
      pageOptions: pageOptions,
      origin: origin,
      extractor_options: extractorOptions,
      num_tokens: numTokens,
    });

    return res.status(result.returnCode).json(result);
  } catch (error) {
    console.error(error);
    if (earlyReturn) {
      // A 402 was already sent; a second write would throw.
      return;
    }
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ error: message });
  }
}