2024-08-15 23:30:33 +02:00
import { Response } from "express" ;
2024-08-06 15:24:45 -03:00
import { v4 as uuidv4 } from "uuid" ;
2024-08-20 14:19:20 -03:00
import {
CrawlRequest ,
crawlRequestSchema ,
CrawlResponse ,
RequestWithAuth ,
2024-12-11 19:51:08 -03:00
toLegacyCrawlerOptions ,
2024-08-20 14:19:20 -03:00
} from "./types" ;
2025-01-10 18:35:10 -03:00
import { crawlToCrawler , saveCrawl , StoredCrawl } from "../../lib/crawl-redis" ;
2024-08-15 23:30:33 +02:00
import { logCrawl } from "../../services/logging/crawl_log" ;
2025-01-07 19:15:23 +01:00
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs" ;
2024-12-05 20:50:36 +01:00
import { logger as _logger } from "../../lib/logger" ;
2024-08-06 15:24:45 -03:00
2024-08-20 14:19:20 -03:00
/**
 * Returns the message of the first error raised while compiling `patterns`
 * as regular expressions, or `undefined` when every pattern compiles (or when
 * `patterns` is not an array at all).
 */
function firstInvalidRegexMessage(patterns: unknown): string | undefined {
  if (!Array.isArray(patterns)) {
    return undefined;
  }
  for (const pattern of patterns) {
    try {
      new RegExp(pattern);
    } catch (e: unknown) {
      // Narrow before reading `.message`; fall back to a string form for
      // anything that is not an Error instance.
      return e instanceof Error ? e.message : String(e);
    }
  }
  return undefined;
}

/**
 * Handles a v1 crawl request.
 *
 * Steps, in order:
 *  1. Normalizes `req.body` through `crawlRequestSchema.parse` (the raw body
 *     is kept only for debug logging).
 *  2. Generates a crawl id and records the crawl via `logCrawl`.
 *  3. Rejects the request with HTTP 400 if any `includePaths` /
 *     `excludePaths` entry is not a valid regular expression.
 *  4. Caps the crawl limit to the team's remaining credits (no cap when
 *     `USE_DB_AUTHENTICATION` is not `"true"`).
 *  5. Builds and saves the `StoredCrawl` (best-effort robots.txt fetch) and
 *     enqueues a "kickoff" job.
 *  6. Responds with HTTP 200 and the URL of the crawl status endpoint.
 *
 * @param req Authenticated request carrying the crawl request body.
 * @param res Express response used to send the `CrawlResponse`.
 * @returns The Express response after a payload has been sent.
 * NOTE(review): `crawlRequestSchema.parse` presumably throws on invalid input
 * and is handled by upstream middleware — confirm.
 */
export async function crawlController(
  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
  res: Response<CrawlResponse>,
) {
  const preNormalizedBody = req.body;
  req.body = crawlRequestSchema.parse(req.body);

  const id = uuidv4();
  const logger = _logger.child({
    crawlId: id,
    module: "api/v1",
    method: "crawlController",
    teamId: req.auth.team_id,
  });
  logger.debug("Crawl " + id + " starting", {
    request: req.body,
    originalRequest: preNormalizedBody,
    account: req.account,
  });

  await logCrawl(id, req.auth.team_id);

  // Credits are only enforced when DB authentication is on; otherwise the
  // account is not consulted at all, so a missing account cannot crash here.
  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
  const remainingCredits = useDbAuthentication
    ? req.account!.remainingCredits
    : Infinity;

  // Everything in the body except the target URL and the scrape options is
  // treated as crawler configuration.
  const crawlerOptions = {
    ...req.body,
    url: undefined,
    scrapeOptions: undefined,
  };
  const scrapeOptions = req.body.scrapeOptions;

  // TODO: @rafa, is this right? copied from v0
  // includePaths is checked before excludePaths; the first failure wins.
  const invalidPatternMessage =
    firstInvalidRegexMessage(crawlerOptions.includePaths) ??
    firstInvalidRegexMessage(crawlerOptions.excludePaths);
  if (invalidPatternMessage !== undefined) {
    return res
      .status(400)
      .json({ success: false, error: invalidPatternMessage });
  }

  // Never crawl more pages than the team can pay for.
  const originalLimit = crawlerOptions.limit;
  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
  logger.debug("Determined limit: " + crawlerOptions.limit, {
    remainingCredits,
    bodyLimit: originalLimit,
    originalBodyLimit: preNormalizedBody.limit,
  });

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
    scrapeOptions,
    // NOTE: smart wait disabled for crawls to ensure contentful scrape, speed does not matter
    internalOptions: { disableSmartWaitCache: true, teamId: req.auth.team_id },
    team_id: req.auth.team_id,
    createdAt: Date.now(),
  };

  const crawler = crawlToCrawler(id, sc);

  // robots.txt is best-effort: a failure is logged and the crawl proceeds
  // without it.
  try {
    sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
  } catch (e) {
    logger.debug("Failed to get robots.txt (this is probably fine!)", {
      error: e,
    });
  }

  await saveCrawl(id, sc);

  // Enqueue the kickoff job under a fresh job id. The trailing `10` is
  // presumably the job priority — confirm against `_addScrapeJobToBullMQ`.
  await _addScrapeJobToBullMQ(
    {
      url: req.body.url,
      mode: "kickoff" as const,
      team_id: req.auth.team_id,
      crawlerOptions,
      scrapeOptions: sc.scrapeOptions,
      internalOptions: sc.internalOptions,
      origin: req.body.origin,
      crawl_id: id,
      webhook: req.body.webhook,
      v1: true,
    },
    {},
    crypto.randomUUID(),
    10,
  );

  // Outside local development the status URL is always advertised as https.
  const protocol = process.env.ENV === "local" ? req.protocol : "https";

  return res.status(200).json({
    success: true,
    id,
    url: `${protocol}://${req.get("host")}/v1/crawl/${id}`,
  });
}