Files
firecrawl/apps/api/src/lib/extract/url-processor.ts
T

170 lines
4.9 KiB
TypeScript
Raw Normal View History

2024-12-26 12:41:37 -03:00
import { MapDocument, URLTrace } from "../../controllers/v1/types";
import { getMapResults } from "../../controllers/v1/map";
import { PlanType } from "../../types";
import { removeDuplicateUrls } from "../validateUrl";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts";
import { logger } from "../logger";
import { rerankLinks } from "./reranker";
import { extractConfig } from "./config";
2024-12-26 12:41:37 -03:00
/**
 * Input accepted by `processUrl`.
 *
 * Required fields come first; every optional field other than `prompt` is
 * forwarded to `getMapResults` when the URL has to be mapped.
 */
interface ProcessUrlOptions {
  /**
   * URL to process. When it contains "/*" (or `allowExternalLinks` is set)
   * the URL is mapped instead of being returned on its own.
   */
  url: string;
  /** Team identifier forwarded to `getMapResults`. */
  teamId: string;
  /** Plan forwarded to `getMapResults`. */
  plan: PlanType;

  /**
   * Optional prompt. When present it is rephrased and used as the map search
   * term, and later as part of the query passed to `rerankLinks`.
   */
  prompt?: string;
  /**
   * Forwarded to `getMapResults`; when truthy the URL is always mapped, even
   * without a "/*" marker.
   */
  allowExternalLinks?: boolean;
  /** Forwarded to `getMapResults`. */
  origin?: string;
  /** Forwarded to `getMapResults`. */
  limit?: number;
  /** Forwarded to `getMapResults`. */
  includeSubdomains?: boolean;
}
2025-01-10 18:35:10 -03:00
/**
 * Appends a "mapped" trace to `urlTraces` for every URL in `urls` that is not
 * already traced.
 *
 * @param urlTraces - Trace list to extend in place.
 * @param urls - URLs discovered by a map call.
 * @param warning - Optional warning text stored on each newly created trace.
 */
function trackDiscoveredUrls(
  urlTraces: URLTrace[],
  urls: readonly string[],
  warning?: string,
): void {
  // A Set lookup replaces a linear scan of urlTraces for every discovered URL.
  const tracedUrls = new Set(urlTraces.map((t) => t.url));
  for (const discoveredUrl of urls) {
    if (tracedUrls.has(discoveredUrl)) {
      continue;
    }
    tracedUrls.add(discoveredUrl);
    const entry: URLTrace = {
      url: discoveredUrl,
      status: "mapped",
      ...(warning !== undefined ? { warning } : {}),
      timing: {
        discoveredAt: new Date().toISOString(),
      },
      usedInCompletion: false,
    };
    urlTraces.push(entry);
  }
}

/**
 * Resolves `options.url` into a list of URLs and records what happened in
 * `urlTraces`.
 *
 * - If the URL has no "/*" marker and external links are not allowed, the URL
 *   itself is returned (or an empty list when `isUrlBlocked` rejects it).
 * - Otherwise the URL is mapped via `getMapResults` (using a rephrased prompt
 *   as the search term when a prompt is given). If that yields at most one
 *   URL, the map is retried without a search term. The resulting links are
 *   capped at `extractConfig.MAX_INITIAL_RANKING_LIMIT` and, when a prompt is
 *   given, passed through `rerankLinks`.
 *
 * @param options - URL, prompt and map settings; see `ProcessUrlOptions`.
 * @param urlTraces - Trace list mutated in place: a trace for `options.url` is
 *   always appended, plus one per newly discovered URL.
 * @returns The resulting URLs, or an empty array when the URL is blocked or
 *   mapping/reranking throws (the error text is stored on the trace).
 */
export async function processUrl(
  options: ProcessUrlOptions,
  urlTraces: URLTrace[],
): Promise<string[]> {
  const trace: URLTrace = {
    url: options.url,
    status: "mapped",
    timing: {
      discoveredAt: new Date().toISOString(),
    },
  };
  urlTraces.push(trace);

  // Plain single URL: nothing to map, just honour the blocklist.
  if (!options.url.includes("/*") && !options.allowExternalLinks) {
    if (!isUrlBlocked(options.url)) {
      trace.usedInCompletion = true;
      return [options.url];
    }
    trace.status = "error";
    trace.error = "URL is blocked";
    trace.usedInCompletion = false;
    return [];
  }

  const baseUrl = options.url.replace("/*", "");
  // NOTE(review): this strips the first "www." anywhere in the string, not
  // only in the host — kept as-is because the value feeds the rerank query.
  const urlWithoutWww = baseUrl.replace("www.", "");

  let rephrasedPrompt = options.prompt;
  if (options.prompt) {
    rephrasedPrompt =
      (await generateBasicCompletion(
        buildRefrasedPrompt(options.prompt, baseUrl),
      )) ?? options.prompt;
  }

  try {
    // Arguments shared by the prompt-guided map call and the broader retry.
    const baseMapArgs = {
      url: baseUrl,
      teamId: options.teamId,
      plan: options.plan,
      allowExternalLinks: options.allowExternalLinks,
      origin: options.origin,
      limit: options.limit,
      ignoreSitemap: false,
      includeMetadata: true,
      includeSubdomains: options.includeSubdomains,
    };

    const mapResults = await getMapResults({
      ...baseMapArgs,
      search: rephrasedPrompt,
    });
    let mappedLinks = mapResults.mapResults as MapDocument[];
    let allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
    let uniqueUrls = removeDuplicateUrls(allUrls);

    // Track all discovered URLs
    trackDiscoveredUrls(urlTraces, uniqueUrls);

    // retry if only one url is returned
    if (uniqueUrls.length <= 1) {
      const retryMapResults = await getMapResults(baseMapArgs);

      mappedLinks = retryMapResults.mapResults as MapDocument[];
      // Use the retry's own links; the first call's links were mistakenly
      // re-used here before, discarding what the broader search found.
      allUrls = [...mappedLinks.map((m) => m.url), ...retryMapResults.links];
      uniqueUrls = removeDuplicateUrls(allUrls);

      // Track all discovered URLs
      trackDiscoveredUrls(
        urlTraces,
        uniqueUrls,
        "Broader search. Not limiting map results to prompt.",
      );
    }

    // Add URLs that only appeared in `links` as metadata-less map documents.
    const existingUrls = new Set(mappedLinks.map((m) => m.url));
    const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
    mappedLinks = [
      ...mappedLinks,
      ...newUrls.map((url) => ({ url, title: "", description: "" })),
    ];

    // Fall back to the base URL when mapping found nothing at all.
    if (mappedLinks.length === 0) {
      mappedLinks = [{ url: baseUrl, title: "", description: "" }];
    }

    // Limit initial set of links to the configured maximum
    mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);

    // Perform reranking if prompt is provided
    if (options.prompt) {
      const searchQuery = options.allowExternalLinks
        ? `${options.prompt} ${urlWithoutWww}`
        : `${options.prompt} site:${urlWithoutWww}`;
      mappedLinks = await rerankLinks(mappedLinks, searchQuery, urlTraces);
    }

    return mappedLinks.map((x) => x.url);
  } catch (error) {
    trace.status = "error";
    // The caught value is `unknown`; it may not be an Error instance.
    trace.error = error instanceof Error ? error.message : String(error);
    trace.usedInCompletion = false;
    return [];
  }
}