2024-08-13 20:51:43 +02:00
import axios , { AxiosError } from "axios" ;
2025-01-24 22:04:54 +01:00
import { load } from "cheerio" ; // rustified
2024-04-15 17:01:47 -04:00
import { URL } from "url" ;
import { getLinksFromSitemap } from "./sitemap" ;
2025-01-17 15:45:52 +01:00
import robotsParser , { Robot } from "robots-parser" ;
2024-06-15 16:43:37 -04:00
import { getURLDepth } from "./utils/maxDepthUtils" ;
2024-12-27 19:59:26 +01:00
import { axiosTimeout } from "../../lib/timeout" ;
import { logger as _logger } from "../../lib/logger" ;
2024-10-23 01:07:03 +03:00
import https from "https" ;
2024-12-27 19:59:26 +01:00
import { redisConnection } from "../../services/queue-service" ;
2025-01-24 22:04:54 +01:00
import { extractLinks } from "../../lib/html-transformer" ;
2025-02-20 00:42:13 +01:00
import { TimeoutSignal } from "../../controllers/v1/types" ;
2024-04-15 17:01:47 -04:00
/**
 * Link policy and link discovery for a single crawl job.
 *
 * The class decides which URLs are eligible for crawling (depth, include /
 * exclude patterns, robots.txt, file extensions, internal vs. external host)
 * and discovers URLs from sitemaps and from HTML documents.
 */
export class WebCrawler {
  /** Crawl job id; used in Redis key names and as `crawlId` in log metadata. */
  private jobId: string;
  /** URL the crawl was started from. */
  private initialUrl: string;
  /** Origin that relative links are resolved against and that defines "internal". */
  private baseUrl: string;
  /** Regular-expression sources a link must match (any of) to be kept. */
  private includes: string[];
  /** Regular-expression sources (or, for external links, domains) to reject. */
  private excludes: string[];
  /** Deprecated, use `limit` instead. */
  private maxCrawledLinks: number;
  /** Maximum URL depth handed to `filterLinks` by `tryGetSitemap`. */
  private maxCrawledDepth: number;
  // `visited` and `crawledUrls` are not referenced by any method of this class.
  private visited: Set<string> = new Set();
  private crawledUrls: Map<string, string> = new Map();
  /** Maximum number of sitemap links `tryGetSitemap` hands to its handler. */
  private limit: number;
  /** Location of robots.txt, derived from `baseUrl`. */
  private robotsTxtUrl: string;
  /** Parsed robots.txt; starts empty and is replaced by `importRobotsTxt`. */
  public robots: Robot;
  // Stored but not read by any method of this class.
  private generateImgAltText: boolean;
  /** When false, links whose path does not start with the initial URL's path are rejected. */
  private allowBackwardCrawling: boolean;
  /** When true, links to other hosts found on internal pages may be returned. */
  private allowExternalContentLinks: boolean;
  /** When true, links to subdomains of the base domain may be returned. */
  private allowSubdomains: boolean;
  /** When true, robots.txt rules are not consulted. */
  private ignoreRobotsTxt: boolean;
  /** When true, include/exclude patterns are tested against the raw link instead of its path. */
  private regexOnFullURL: boolean;
  private logger: typeof _logger;
  /** Passed to `getLinksFromSitemap`; its size is checked against 20 to emit a warning. */
  private sitemapsHit: Set<string> = new Set();
  private maxDiscoveryDepth: number | undefined;
  private currentDiscoveryDepth: number;

  /**
   * @param options.jobId Crawl job id.
   * @param options.initialUrl URL the crawl starts from.
   * @param options.baseUrl Defaults to the origin of `initialUrl`.
   * @param options.includes Include patterns; non-arrays are treated as empty.
   * @param options.excludes Exclude patterns; non-arrays are treated as empty.
   * @param options.maxCrawledLinks Deprecated, use `limit` instead.
   * @param options.currentDiscoveryDepth Defaults to 0.
   * @throws TypeError when `baseUrl` is omitted and `initialUrl` is not a valid URL.
   */
  constructor({
    jobId,
    initialUrl,
    baseUrl,
    includes,
    excludes,
    maxCrawledLinks = 10000,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
    allowBackwardCrawling = false,
    allowExternalContentLinks = false,
    allowSubdomains = false,
    ignoreRobotsTxt = false,
    regexOnFullURL = false,
    maxDiscoveryDepth,
    currentDiscoveryDepth,
  }: {
    jobId: string;
    initialUrl: string;
    baseUrl?: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
    allowBackwardCrawling?: boolean;
    allowExternalContentLinks?: boolean;
    allowSubdomains?: boolean;
    ignoreRobotsTxt?: boolean;
    regexOnFullURL?: boolean;
    maxDiscoveryDepth?: number;
    currentDiscoveryDepth?: number;
  }) {
    this.jobId = jobId;
    this.initialUrl = initialUrl;
    this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
    this.includes = Array.isArray(includes) ? includes : [];
    this.excludes = Array.isArray(excludes) ? excludes : [];
    this.limit = limit;
    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
    // Start with an empty rule set until importRobotsTxt() provides the real one.
    this.robots = robotsParser(this.robotsTxtUrl, "");
    // Deprecated, use limit instead
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
    this.allowBackwardCrawling = allowBackwardCrawling ?? false;
    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
    this.allowSubdomains = allowSubdomains ?? false;
    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
    this.regexOnFullURL = regexOnFullURL ?? false;
    this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
    this.maxDiscoveryDepth = maxDiscoveryDepth;
    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
  }

  /**
   * Filters candidate links down to the ones this crawl is allowed to visit.
   *
   * Checks, in order: URL parseability, depth, exclude patterns, include
   * patterns, absolute-URL parseability, backward crawling, robots.txt and
   * file extension. Setting the `FIRECRAWL_DEBUG_FILTER_LINKS` environment
   * variable logs the reason for every decision.
   *
   * @param sitemapLinks Candidate links.
   * @param limit Maximum number of links returned.
   * @param maxDepth Maximum depth as computed by `getURLDepth`.
   * @param fromMap When true and the initial URL ends with `sitemap.xml`, filtering is skipped.
   * @returns At most `limit` links, in input order.
   */
  public filterLinks(
    sitemapLinks: string[],
    limit: number,
    maxDepth: number,
    fromMap: boolean = false,
  ): string[] {
    // `>=` rather than strict equality: a crawler that is already past the
    // configured maximum must not start discovering links again.
    if (
      typeof this.maxDiscoveryDepth === "number" &&
      this.currentDiscoveryDepth >= this.maxDiscoveryDepth
    ) {
      this.logger.debug("Max discovery depth hit, filtering off all links", {
        currentDiscoveryDepth: this.currentDiscoveryDepth,
        maxDiscoveryDepth: this.maxDiscoveryDepth,
      });
      return [];
    }

    // If the initial URL is a sitemap.xml, skip filtering
    if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
      return sitemapLinks.slice(0, limit);
    }

    const traceEnabled = Boolean(process.env.FIRECRAWL_DEBUG_FILTER_LINKS);
    const trace = (message: string): void => {
      if (traceEnabled) {
        this.logger.debug(message);
      }
    };

    // Patterns are compiled on first use and then reused for every link, so
    // an invalid pattern still only throws once it is actually evaluated.
    const compiled = new Map<string, RegExp>();
    const regexFor = (pattern: string): RegExp => {
      let regex = compiled.get(pattern);
      if (regex === undefined) {
        regex = new RegExp(pattern);
        compiled.set(pattern, regex);
      }
      return regex;
    };

    const useExcludes = this.excludes.length > 0 && this.excludes[0] !== "";
    const useIncludes = this.includes.length > 0 && this.includes[0] !== "";
    // Parsed lazily: only needed when backward crawling is disallowed.
    let initialPathname: string | undefined;

    return sitemapLinks
      .filter((link) => {
        let url: URL;
        try {
          url = new URL(link.trim(), this.baseUrl);
        } catch (error) {
          this.logger.debug(`Error processing link: ${link}`, {
            link,
            error,
            method: "filterLinks",
          });
          return false;
        }

        // Check if the link exceeds the maximum depth allowed
        if (getURLDepth(url.toString()) > maxDepth) {
          trace(`${link} DEPTH FAIL`);
          return false;
        }

        const candidate = this.regexOnFullURL ? link : url.pathname;

        // Check if the link should be excluded
        if (
          useExcludes &&
          this.excludes.some((pattern) => regexFor(pattern).test(candidate))
        ) {
          trace(`${link} EXCLUDE FAIL`);
          return false;
        }

        // Check if the link matches the include patterns, if any are specified
        if (
          useIncludes &&
          !this.includes.some((pattern) => regexFor(pattern).test(candidate))
        ) {
          trace(`${link} INCLUDE FAIL`);
          return false;
        }

        // From here on only links that are absolute URLs on their own are accepted.
        let absoluteLink: URL;
        try {
          absoluteLink = new URL(link);
        } catch {
          trace(`${link} URL PARSE FAIL`);
          return false;
        }

        if (!this.allowBackwardCrawling) {
          if (initialPathname === undefined) {
            initialPathname = new URL(this.initialUrl).pathname;
          }
          if (!absoluteLink.pathname.startsWith(initialPathname)) {
            trace(`${link} BACKWARDS FAIL ${absoluteLink.pathname} ${initialPathname}`);
            return false;
          }
        }

        // Check if the link is disallowed by robots.txt
        if (!this.isRobotsAllowed(link, this.ignoreRobotsTxt)) {
          this.logger.debug(`Link disallowed by robots.txt: ${link}`, {
            method: "filterLinks",
            link,
          });
          trace(`${link} ROBOTS FAIL`);
          return false;
        }

        if (this.isFile(link)) {
          trace(`${link} FILE FAIL`);
          return false;
        }

        trace(`${link} OK`);
        return true;
      })
      .slice(0, limit);
  }

  /**
   * Downloads robots.txt for the base URL.
   *
   * @param skipTlsVerification When true, TLS certificate errors are ignored.
   * @param abort Optional signal forwarded to the HTTP request.
   * @returns The response body.
   */
  public async getRobotsTxt(skipTlsVerification = false, abort?: AbortSignal): Promise<string> {
    const tlsOptions = skipTlsVerification
      ? { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }
      : {};
    const response = await axios.get(this.robotsTxtUrl, {
      timeout: axiosTimeout,
      signal: abort,
      ...tlsOptions,
    });
    return response.data;
  }

  /** Replaces the current robots.txt rule set with the one parsed from `txt`. */
  public importRobotsTxt(txt: string) {
    this.robots = robotsParser(this.robotsTxtUrl, txt);
  }

  /**
   * Collects links from the sitemap of the initial URL and from every sitemap
   * listed in robots.txt, handing them to `urlsHandler` in batches.
   *
   * Unless both `fromMap` and `onlySitemap` are set, each batch is filtered
   * with `filterLinks`, capped by the remaining `limit`, and de-duplicated
   * across batches through a Redis set keyed by the job id.
   *
   * @param urlsHandler Receives each batch of accepted URLs.
   * @param timeout Overall time budget in milliseconds.
   * @param abort Forwarded to the sitemap fetcher.
   * @param mock Forwarded to the sitemap fetcher.
   * @returns The summed count reported by the sitemap fetches, plus one for the
   *   initial URL when that sum is positive; 0 on timeout or error.
   */
  public async tryGetSitemap(
    urlsHandler: (urls: string[]) => unknown,
    fromMap: boolean = false,
    onlySitemap: boolean = false,
    timeout: number = 120000,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
      method: "tryGetSitemap",
    });

    const dedupeKey = "sitemap:" + this.jobId + ":links";
    let leftOfLimit = this.limit;

    // Scheme, leading "www." and a trailing slash are ignored when de-duplicating.
    const normalizeUrl = (url: string): string => {
      const stripped = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
      return stripped.endsWith("/") ? stripped.slice(0, -1) : stripped;
    };

    const handleBatch = async (urls: string[]) => {
      if (fromMap && onlySitemap) {
        return urlsHandler(urls);
      }

      const filteredLinks = this.filterLinks(
        [...new Set(urls)],
        leftOfLimit,
        this.maxCrawledDepth,
        fromMap,
      );
      leftOfLimit -= filteredLinks.length;

      const uniqueURLs: string[] = [];
      for (const url of filteredLinks) {
        // sadd is truthy only when the member was not in the set yet.
        if (await redisConnection.sadd(dedupeKey, normalizeUrl(url))) {
          uniqueURLs.push(url);
        }
      }
      await redisConnection.expire(dedupeKey, 3600, "NX");

      if (uniqueURLs.length > 0) {
        return urlsHandler(uniqueURLs);
      }
    };

    // The timeout is recognised by identity, so an unrelated error that happens
    // to carry the same message is not mistaken for it.
    const timeoutError = new Error("Sitemap fetch timeout");
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(timeoutError), timeout);
    });

    try {
      const fetches = [
        this.tryFetchSitemapLinks(this.initialUrl, handleBatch, abort, mock),
        ...this.robots
          .getSitemaps()
          .map((x) => this.tryFetchSitemapLinks(x, handleBatch, abort, mock)),
      ];

      let count: number = await Promise.race([
        Promise.all(fetches).then((results) => results.reduce((a, x) => a + x, 0)),
        timeoutPromise,
      ]);

      if (count > 0) {
        if (await redisConnection.sadd(dedupeKey, normalizeUrl(this.initialUrl))) {
          // Awaited so that a failing handler is reported below instead of
          // surfacing as an unhandled rejection.
          await urlsHandler([this.initialUrl]);
        }
        count++;
      }

      return count;
    } catch (error) {
      if (error === timeoutError) {
        this.logger.warn("Sitemap fetch timed out", {
          method: "tryGetSitemap",
          timeout,
        });
        return 0;
      }

      this.logger.error("Error fetching sitemap", {
        method: "tryGetSitemap",
        error,
      });
      return 0;
    } finally {
      // Without this the timer stays pending for up to `timeout` ms after the
      // fetches have already settled.
      clearTimeout(timer);
    }
  }

  /**
   * Resolves `href` against the page it was found on and decides whether the
   * crawl may follow it.
   *
   * Internal links that are rejected only because of robots.txt are recorded
   * in the Redis set `crawl:<jobId>:robots_blocked` (best effort).
   *
   * @param href Link target as found in the document.
   * @param url URL of the document containing the link.
   * @returns The absolute URL when the link is accepted, otherwise `null`.
   */
  public filterURL(href: string, url: string): string | null {
    let fullUrl = href;
    if (!href.startsWith("http")) {
      try {
        fullUrl = new URL(href, url).toString();
      } catch {
        return null;
      }
    }

    let urlObj: URL;
    try {
      urlObj = new URL(fullUrl);
    } catch {
      return null;
    }
    const path = urlObj.pathname;

    if (this.isInternalLink(fullUrl)) {
      // INTERNAL LINKS
      if (this.noSections(fullUrl) && !this.matchesExcludes(path)) {
        if (this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)) {
          return fullUrl;
        }

        // Fire-and-forget bookkeeping; a Redis failure must not become an
        // unhandled rejection, so it is caught and logged.
        const blockedKey = "crawl:" + this.jobId + ":robots_blocked";
        void (async () => {
          await redisConnection.sadd(blockedKey, fullUrl);
          await redisConnection.expire(blockedKey, 24 * 60 * 60, "NX");
        })().catch((error) => {
          this.logger.warn("Failed to record robots-blocked URL", {
            method: "filterURL",
            url: fullUrl,
            error,
          });
        });
      }
    } else {
      // EXTERNAL LINKS
      if (
        this.isInternalLink(url) &&
        this.allowExternalContentLinks &&
        !this.isSocialMediaOrEmail(fullUrl) &&
        !this.matchesExcludes(fullUrl, true) &&
        !this.isExternalMainPage(fullUrl)
      ) {
        return fullUrl;
      }
    }

    if (
      this.allowSubdomains &&
      !this.isSocialMediaOrEmail(fullUrl) &&
      this.isSubdomain(fullUrl)
    ) {
      return fullUrl;
    }

    return null;
  }

  /**
   * Extracts links with the html-transformer binding and keeps the raw link
   * strings that `filterURL` accepts.
   */
  private async extractLinksFromHTMLRust(html: string, url: string) {
    const rawLinks = await extractLinks(html);
    return rawLinks.filter((x) => this.filterURL(x, url));
  }

  /**
   * Cheerio-based link extraction: anchors of the document plus anchors of
   * iframes whose `src` is an inline `data:text/html` URL.
   *
   * @returns Accepted absolute URLs; duplicates are not removed.
   */
  private extractLinksFromHTMLCheerio(html: string, url: string): string[] {
    const links: string[] = [];
    const $ = load(html);

    $("a").each((_, element) => {
      let href = $(element).attr("href");
      if (!href) {
        return;
      }
      // Repair "https:/host" (single slash) into "https://host".
      if (/^https?:\/[^\/]/.test(href)) {
        href = href.replace(/^https?:\//, "$&/");
      }
      const accepted = this.filterURL(href, url);
      if (accepted !== null) {
        links.push(accepted);
      }
    });

    // Extract links from iframes with inline src
    $("iframe").each((_, element) => {
      const src = $(element).attr("src");
      if (!src || !src.startsWith("data:text/html")) {
        return;
      }

      // Everything after the FIRST comma is payload; the HTML itself may
      // contain further commas.
      const comma = src.indexOf(",");
      if (comma === -1) {
        return;
      }
      const header = src.slice(0, comma);
      const payload = src.slice(comma + 1);

      let iframeHtml: string;
      try {
        iframeHtml = /;base64$/i.test(header)
          ? Buffer.from(payload, "base64").toString("utf8")
          : decodeURIComponent(payload);
      } catch (error) {
        // A malformed percent-escape must not abort extraction for the page.
        this.logger.debug("Failed to decode inline iframe source", {
          method: "extractLinksFromHTMLCheerio",
          error,
        });
        return;
      }

      for (const nested of this.extractLinksFromHTMLCheerio(iframeHtml, url)) {
        links.push(nested);
      }
    });

    return links;
  }

  /**
   * Extracts the crawlable links of an HTML document.
   *
   * Uses the html-transformer binding and falls back to cheerio when that
   * call fails.
   *
   * @param html Document markup.
   * @param url URL of the document; relative links are resolved against it.
   * @returns De-duplicated absolute URLs.
   */
  public async extractLinksFromHTML(html: string, url: string): Promise<string[]> {
    try {
      const resolved: string[] = [];
      for (const link of await this.extractLinksFromHTMLRust(html, url)) {
        try {
          resolved.push(new URL(link, url).href);
        } catch {
          // Unresolvable link: skip it.
        }
      }
      return [...new Set(resolved)];
    } catch (error) {
      this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
        error,
        module: "WebCrawler",
        method: "extractLinksFromHTML",
      });
    }

    return [...new Set(this.extractLinksFromHTMLCheerio(html, url))];
  }

  /**
   * Whether robots.txt permits fetching `url` for either spelling of the
   * crawler's user agent. A missing verdict counts as allowed.
   */
  private isRobotsAllowed(
    url: string,
    ignoreRobotsTxt: boolean = false,
  ): boolean {
    if (ignoreRobotsTxt || !this.robots) {
      return true;
    }
    return (
      (this.robots.isAllowed(url, "FireCrawlAgent") ||
        this.robots.isAllowed(url, "FirecrawlAgent")) ??
      true
    );
  }

  /**
   * Whether `url` is rejected by the exclude list.
   *
   * @param onlyDomains When true the excludes are interpreted as domains (see
   *   `matchesExcludesExternalDomains`), otherwise as regular expressions.
   */
  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
    if (onlyDomains) {
      return this.matchesExcludesExternalDomains(url);
    }
    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
  }

  // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
  /**
   * Whether `url` belongs to one of the excluded domains (or a subdomain of
   * one) and its path starts with the excluded entry's path.
   *
   * Entries that cannot be parsed as a domain are skipped individually so
   * they do not disable the entries that follow them.
   */
  private matchesExcludesExternalDomains(url: string): boolean {
    let target: URL;
    try {
      target = new URL(url);
    } catch {
      return false;
    }

    for (const entry of this.excludes) {
      let excluded: URL;
      try {
        excluded = new URL("http://" + entry.replace(/^https?:\/\//, ""));
      } catch {
        continue;
      }

      const hostMatches =
        target.hostname === excluded.hostname ||
        target.hostname.endsWith(`.${excluded.hostname}`);
      if (hostMatches && target.pathname.startsWith(excluded.pathname)) {
        return true;
      }
    }

    return false;
  }

  /** True when the URL has no non-empty path segment after the host. */
  private isExternalMainPage(url: string): boolean {
    return url
      .split("/")
      .slice(3)
      .every((segment) => segment.length === 0);
  }

  /** True when the link contains no `#`. */
  private noSections(link: string): boolean {
    return link.indexOf("#") === -1;
  }

  /** True when the link's host equals the base URL's host, ignoring a leading "www.". */
  private isInternalLink(link: string): boolean {
    const bareHost = (hostname: string): string =>
      hostname.replace(/^www\./, "").trim();

    const linkDomain = bareHost(new URL(link, this.baseUrl).hostname);
    const baseDomain = bareHost(new URL(this.baseUrl).hostname);
    return linkDomain === baseDomain;
  }

  /**
   * True when the link's host ends with "." followed by the last two labels
   * of the base URL's host.
   */
  private isSubdomain(link: string): boolean {
    const registrable = new URL(this.baseUrl).hostname.split(".").slice(-2).join(".");
    return new URL(link, this.baseUrl).hostname.endsWith("." + registrable);
  }

  /**
   * Whether the URL points at a file type the crawler does not follow.
   *
   * The extension is taken from the parsed pathname, so query strings,
   * fragments and the host name do not influence the result. Input that is
   * not an absolute URL is matched as a string with query and fragment removed.
   */
  public isFile(url: string): boolean {
    const fileExtensions = [
      ".png",
      ".jpg",
      ".jpeg",
      ".gif",
      ".css",
      ".js",
      ".ico",
      ".svg",
      ".tiff",
      // ".pdf",
      ".zip",
      ".exe",
      ".dmg",
      ".mp4",
      ".mp3",
      ".wav",
      ".pptx",
      // ".docx",
      ".xlsx",
      // ".xml",
      ".avi",
      ".flv",
      ".woff",
      ".ttf",
      ".woff2",
      ".webp",
      ".inc",
    ];

    try {
      let target: string;
      try {
        target = new URL(url).pathname;
      } catch {
        target = url.split(/[?#]/)[0];
      }
      const normalized = target.toLowerCase();
      return fileExtensions.some((ext) => normalized.endsWith(ext));
    } catch (error) {
      this.logger.error(`Error processing URL in isFile`, {
        method: "isFile",
        error,
      });
      return false;
    }
  }

  /** True when the URL contains one of the listed social-media hosts or `mailto:`. */
  private isSocialMediaOrEmail(url: string): boolean {
    const socialMediaOrEmail = [
      "facebook.com",
      "twitter.com",
      "linkedin.com",
      "instagram.com",
      "pinterest.com",
      "mailto:",
      "github.com",
      "calendly.com",
      "discord.gg",
      "discord.com",
    ];
    return socialMediaOrEmail.some((needle) => url.includes(needle));
  }

  /** Runs one sitemap fetch with this crawler's logger, job id and sitemap bookkeeping. */
  private fetchSitemap(
    sitemapUrl: string,
    urlsHandler: (urls: string[]) => unknown,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    return getLinksFromSitemap(
      { sitemapUrl, urlsHandler, mode: "fire-engine" },
      this.logger,
      this.jobId,
      this.sitemapsHit,
      abort,
      mock,
    );
  }

  /**
   * Fetches sitemap links for `url`, trying in turn:
   *  1. `url` itself when it ends with ".xml", otherwise `<url>/sitemap.xml`;
   *  2. when the host is a non-"www" subdomain, the main domain's sitemap,
   *     keeping only links on the original host or below it;
   *  3. when nothing was found so far, `<baseUrl>/sitemap.xml`.
   *
   * Failures are logged and skipped, except `TimeoutSignal`, which is rethrown.
   *
   * @returns The summed count reported by the sitemap fetcher.
   */
  private async tryFetchSitemapLinks(
    url: string,
    urlsHandler: (urls: string[]) => unknown,
    abort?: AbortSignal,
    mock?: string,
  ): Promise<number> {
    const sitemapUrl = url.endsWith(".xml")
      ? url
      : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
    let sitemapCount = 0;

    // Try to get sitemap from the provided URL first
    try {
      sitemapCount = await this.fetchSitemap(sitemapUrl, urlsHandler, abort, mock);
    } catch (error) {
      if (error instanceof TimeoutSignal) {
        throw error;
      }
      this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
        method: "tryFetchSitemapLinks",
        sitemapUrl,
        error,
      });
    }

    // If this is a subdomain, also try to get sitemap from the main domain
    try {
      const urlObj = new URL(url);
      const hostname = urlObj.hostname;
      const domainParts = hostname.split(".");

      // Check if this is a subdomain (has more than 2 parts and not www)
      if (domainParts.length > 2 && domainParts[0] !== "www") {
        // Get the main domain by taking the last two parts
        const mainDomain = domainParts.slice(-2).join(".");
        const mainDomainSitemapUrl = `${urlObj.protocol}//${mainDomain}/sitemap.xml`;

        // Keep only links on the original host or below it. A plain suffix
        // test would also accept e.g. "notblog.example.com" for "blog.example.com".
        const onOriginalHost = (link: string): boolean => {
          try {
            const linkHost = new URL(link).hostname;
            return linkHost === hostname || linkHost.endsWith("." + hostname);
          } catch {
            return false;
          }
        };

        try {
          // Get all links from the main domain's sitemap
          sitemapCount += await this.fetchSitemap(
            mainDomainSitemapUrl,
            (urls) => urlsHandler(urls.filter(onOriginalHost)),
            abort,
            mock,
          );
        } catch (error) {
          if (error instanceof TimeoutSignal) {
            throw error;
          }
          this.logger.debug(
            `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
            { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
          );
        }
      }
    } catch (error) {
      if (error instanceof TimeoutSignal) {
        throw error;
      }
      this.logger.debug(`Error processing main domain sitemap`, {
        method: "tryFetchSitemapLinks",
        url,
        error,
      });
    }

    // If no sitemap found yet, try the baseUrl as a last resort
    if (sitemapCount === 0) {
      const baseUrlSitemap = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}sitemap.xml`;
      try {
        sitemapCount += await this.fetchSitemap(baseUrlSitemap, urlsHandler, abort, mock);
      } catch (error) {
        if (error instanceof TimeoutSignal) {
          throw error;
        }
        this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
          method: "tryFetchSitemapLinks",
          sitemapUrl: baseUrlSitemap,
          error,
        });

        if (error instanceof AxiosError && error.response?.status === 404) {
          // ignore 404
        } else {
          // One retry. It is guarded so that a second failure cannot reject
          // the whole method and discard what was collected.
          try {
            sitemapCount += await this.fetchSitemap(baseUrlSitemap, urlsHandler, abort, mock);
          } catch (retryError) {
            if (retryError instanceof TimeoutSignal) {
              throw retryError;
            }
            this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
              method: "tryFetchSitemapLinks",
              sitemapUrl: baseUrlSitemap,
              error: retryError,
            });
          }
        }
      }
    }

    if (this.sitemapsHit.size >= 20) {
      this.logger.warn("Sitemap limit hit!", { crawlId: this.jobId, url: this.baseUrl });
    }

    return sitemapCount;
  }
}