Nick: ignoreSitemap, better crawling algo

This commit is contained in:
Nicolas
2024-06-10 18:12:41 -07:00
parent 1bd0327e1a
commit f6b06ac27a
4 changed files with 57 additions and 41 deletions
+14 -11
View File
@@ -35,20 +35,23 @@ export type SearchOptions = {
location?: string;
};
export type CrawlerOptions = {
returnOnlyUrls?: boolean;
includes?: string[];
excludes?: string[];
maxCrawledLinks?: number;
maxDepth?: number;
limit?: number;
generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean;
ignoreSitemap?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
}
export type WebScraperOptions = {
urls: string[];
mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: {
returnOnlyUrls?: boolean;
includes?: string[];
excludes?: string[];
maxCrawledLinks?: number;
maxDepth?: number;
limit?: number;
generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
};
crawlerOptions?: CrawlerOptions;
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;