Files
firecrawl/apps/api/src/lib/extract/index/pinecone.ts
T

168 lines
4.0 KiB
TypeScript
Raw Normal View History

2025-01-10 18:35:10 -03:00
import { Pinecone } from "@pinecone-database/pinecone";
import { Document } from "../../../controllers/v1/types";
import { logger } from "../../logger";
2024-12-30 19:37:48 -03:00
import OpenAI from "openai";
// OpenAI client used by getEmbedding to create embeddings.
// The API key is read from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
// Pinecone client shared by indexPage and searchSimilarPages.
// The non-null assertion only silences the compiler: a missing
// PINECONE_API_KEY is not detected here.
const pinecone = new Pinecone({
apiKey: process.env.PINECONE_API_KEY!,
});
// Name of the Pinecone index to read from and write to, taken from
// PINECONE_INDEX_NAME; falls back to an empty string when the variable is unset.
const INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? "";
2024-12-30 20:04:22 -03:00
// Byte threshold for page markdown: indexPage trims any markdown whose UTF-8
// size exceeds this before embedding it and storing it as record metadata.
const MAX_METADATA_SIZE = 30 * 1024; // 30KB in bytes
2024-12-30 19:37:48 -03:00
/**
 * Metadata stored next to each page vector in Pinecone.
 *
 * Required fields come first; everything else is optional and may be absent
 * on a stored record.
 */
export interface PageMetadata {
  /** Normalized URL of the indexed page (also used as the record id by indexPage). */
  url: string;
  /** Normalized URL the page was reached from; searchSimilarPages filters on it. */
  originUrl: string;
  /** Time of indexing, as returned by `Date.now()` (milliseconds since the epoch). */
  timestamp: number;

  /** Page title. */
  title?: string;
  /** Page description. */
  description?: string;
  /** Identifier of the crawl that produced the page, when there is one. */
  crawlId?: string;
  /** Identifier of the team that owns the page, when known. */
  teamId?: string;
  /** Page markdown, possibly trimmed to fit the metadata size budget. */
  markdown?: string;
}
/**
 * Requests an embedding for `text` from OpenAI's `text-embedding-3-small`
 * model and returns the vector of the first (only) result.
 *
 * Errors from the OpenAI client are not handled here; they propagate to the
 * caller.
 *
 * @param text - The text to embed.
 * @returns The embedding vector of the first entry in the response.
 */
async function getEmbedding(text: string) {
  const request = {
    model: "text-embedding-3-small",
    input: text,
    encoding_format: "float",
  } as const;

  const { data } = await openai.embeddings.create(request);
  const [first] = data;
  return first.embedding;
}
/**
 * Canonicalizes a URL so that the `www.` and bare-host variants of a site map
 * to the same string: hosts that do not already start with `www.` get the
 * prefix added. The result is the serialized `href` of the parsed URL.
 *
 * @param url - An absolute URL.
 * @returns The serialized URL with a `www.`-prefixed hostname.
 * @throws TypeError when `url` cannot be parsed by the `URL` constructor.
 */
function normalizeUrl(url: string) {
  const parsed = new URL(url);
  const alreadyPrefixed = parsed.hostname.startsWith("www.");
  if (alreadyPrefixed) {
    return parsed.href;
  }
  parsed.hostname = `www.${parsed.hostname}`;
  return parsed.href;
}
2024-12-30 20:04:22 -03:00
/**
 * Returns the longest prefix of `text` that is at most `maxChars` UTF-16 code
 * units long AND at most `maxBytes` bytes when encoded as UTF-8, cutting only
 * on code-point boundaries so a surrogate pair is never split.
 */
function truncateForMetadata(
  text: string,
  maxChars: number,
  maxBytes: number,
): string {
  let usedBytes = 0;
  let end = 0;
  // for..of walks the string by code point, not by UTF-16 code unit.
  for (const ch of text) {
    const codePoint = ch.codePointAt(0) ?? 0;
    // UTF-8 width of this code point (a lone surrogate encodes as 3 bytes).
    const size =
      codePoint < 0x80 ? 1 : codePoint < 0x800 ? 2 : codePoint < 0x10000 ? 3 : 4;
    if (end + ch.length > maxChars || usedBytes + size > maxBytes) {
      break;
    }
    usedBytes += size;
    end += ch.length;
  }
  return text.slice(0, end);
}

/**
 * Embeds a scraped page and upserts it into the Pinecone index.
 *
 * The text that is embedded is the page title, description and (possibly
 * trimmed) markdown joined by blank lines. The record id is the normalized
 * page URL, so indexing the same page again overwrites the previous record.
 *
 * Failures (invalid URL, OpenAI or Pinecone errors) are caught and logged;
 * this function never rejects.
 *
 * @param document - The scraped page; its `markdown` and `metadata` are read.
 * @param originUrl - URL the page was reached from; stored normalized.
 * @param crawlId - Optional crawl identifier stored in the metadata.
 * @param teamId - Optional team identifier stored in the metadata.
 */
export async function indexPage({
  document,
  originUrl,
  crawlId,
  teamId,
}: {
  document: Document;
  originUrl: string;
  crawlId?: string;
  teamId?: string;
}): Promise<void> {
  try {
    const index = pinecone.index(INDEX_NAME);

    // Trim markdown if it's too long. The character cap keeps the previous
    // behaviour for ASCII text; the byte cap additionally guarantees the
    // result fits the budget when the text is mostly multi-byte characters
    // (a plain half-length slice of 3-byte characters would still exceed it).
    let trimmedMarkdown = document.markdown;
    if (
      trimmedMarkdown &&
      Buffer.byteLength(trimmedMarkdown, "utf-8") > MAX_METADATA_SIZE
    ) {
      trimmedMarkdown = truncateForMetadata(
        trimmedMarkdown,
        Math.floor(MAX_METADATA_SIZE / 2),
        MAX_METADATA_SIZE,
      );
    }

    // Normalize both URLs up front: an invalid URL throws here, before any
    // embedding request has been made.
    const pageUrl = document.metadata.sourceURL || document.metadata.url!;
    const normalizedUrl = normalizeUrl(pageUrl);
    const normalizedOriginUrl = normalizeUrl(originUrl);

    // Create text to embed
    const textToEmbed = [
      document.metadata.title,
      document.metadata.description,
      trimmedMarkdown,
    ]
      .filter(Boolean)
      .join("\n\n");

    // Get embedding from OpenAI
    const embedding = await getEmbedding(textToEmbed);

    // Prepare metadata
    const metadata: PageMetadata = {
      url: normalizedUrl,
      originUrl: normalizedOriginUrl,
      title: document.metadata.title ?? document.metadata.ogTitle ?? "",
      description:
        document.metadata.description ?? document.metadata.ogDescription ?? "",
      crawlId,
      teamId,
      markdown: trimmedMarkdown,
      timestamp: Date.now(),
    };

    // Upsert to Pinecone
    await index.upsert([
      {
        id: normalizedUrl,
        values: embedding,
        metadata: {
          ...metadata,
          // The raw (un-normalized) page URL is also stored as a boolean key.
          // NOTE(review): presumably so records can be filtered by exact
          // source URL — confirm against the consumers of this index.
          [pageUrl]: true,
        },
      },
    ]);

    logger.debug("Successfully indexed page in Pinecone", {
      url: metadata.url,
      crawlId,
    });
  } catch (error) {
    logger.error("Failed to index page in Pinecone", {
      error,
      url: document.metadata.sourceURL || document.metadata.url,
      crawlId,
    });
  }
}
/**
 * Finds indexed pages whose embeddings are closest to the embedding of
 * `query`.
 *
 * When `originUrl` is given, results are restricted to records whose stored
 * `originUrl` equals its normalized form. Failures (invalid `originUrl`,
 * OpenAI or Pinecone errors) are caught and logged, and an empty array is
 * returned instead.
 *
 * @param query - Free text to search for.
 * @param originUrl - Optional origin URL to restrict the search to.
 * @param limit - Maximum number of matches to request (`topK`); defaults to 1000.
 * @returns One `{ url, title, description, score, markdown }` object per
 *   match, or `[]` on error.
 */
export async function searchSimilarPages(
  query: string,
  originUrl?: string,
  limit: number = 1000,
): Promise<any[]> {
  try {
    const index = pinecone.index(INDEX_NAME);

    // Normalize first: an invalid originUrl throws here, before the
    // embedding request is made.
    const normalizedOriginUrl = originUrl ? normalizeUrl(originUrl) : undefined;

    // Get query embedding from OpenAI
    const queryEmbedding = await getEmbedding(query);

    // Build the request as a literal so it is checked against the client's
    // query type; the filter is only included when an originUrl was provided.
    const results = await index.query({
      vector: queryEmbedding,
      topK: limit,
      includeMetadata: true,
      ...(normalizedOriginUrl
        ? { filter: { originUrl: { $eq: normalizedOriginUrl } } }
        : {}),
    });

    return results.matches.map((match) => ({
      url: match.metadata?.url,
      title: match.metadata?.title,
      description: match.metadata?.description,
      score: match.score,
      markdown: match.metadata?.markdown,
    }));
  } catch (error) {
    logger.error("Failed to search similar pages in Pinecone", {
      error,
      query,
      originUrl,
    });
    return [];
  }
}