Files
firecrawl/apps/api/src/lib/ranker.ts
T

91 lines
2.6 KiB
TypeScript
Raw Normal View History

2024-12-11 19:46:11 -03:00
import { configDotenv } from "dotenv";
2024-11-12 18:44:14 -03:00
import OpenAI from "openai";
configDotenv();
/**
 * Requests an embedding vector for `text` from the OpenAI embeddings API.
 *
 * The request uses the `text-embedding-3-small` model with
 * `encoding_format: "float"`. A client is constructed on every call, using
 * the key read from `process.env.OPENAI_API_KEY` at call time.
 *
 * @param text - Input string to embed.
 * @returns The `embedding` of the first entry in the response's `data` array.
 */
async function getEmbedding(text: string) {
  const openai = new OpenAI({
    apiKey: process.env.OPENAI_API_KEY,
  });

  const response = await openai.embeddings.create({
    model: "text-embedding-3-small",
    input: text,
    encoding_format: "float",
  });

  return response.data[0].embedding;
}
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Vectors of different lengths are compared as if the shorter one were
 * zero-padded, so the result is never `NaN` because of a missing component.
 *
 * @param vec1 - First vector.
 * @param vec2 - Second vector.
 * @returns `dot(vec1, vec2) / (|vec1| * |vec2|)`, or `0` when either vector
 *   has zero magnitude (including empty vectors).
 */
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
  // `vec2[i]` is undefined once `i` runs past the end of `vec2`; treating it
  // as 0 keeps the dot product finite instead of poisoning it with NaN.
  const dotProduct = vec1.reduce(
    (sum, val, i) => sum + val * (vec2[i] ?? 0),
    0,
  );
  const magnitude1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0));
  const magnitude2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0));

  // Avoid dividing by zero for all-zero or empty vectors.
  if (magnitude1 === 0 || magnitude2 === 0) return 0;
  return dotProduct / (magnitude1 * magnitude2);
};
/**
 * Converts `text` into a term-frequency vector keyed by the words of
 * `searchQuery`.
 *
 * The query is lower-cased and split on runs of non-word characters; empty
 * tokens (produced by leading/trailing punctuation or an empty query) are
 * dropped, because an empty pattern would match at every position of the text.
 *
 * @param searchQuery - Query whose words define the vector's dimensions.
 * @param text - Text in which occurrences of each query word are counted
 *   (case-insensitively, non-overlapping).
 * @returns One entry per non-empty query word: occurrence count divided by
 *   `text.length`. All entries are `0` when `text` is empty.
 */
const textToVector = (searchQuery: string, text: string): number[] => {
  const words = searchQuery
    .toLowerCase()
    .split(/\W+/)
    .filter((word) => word.length > 0);

  // Guard against 0/0 when there is no text to search.
  if (text.length === 0) return words.map(() => 0);

  // Lower-case once rather than once per query word.
  const haystack = text.toLowerCase();

  // Words contain only word characters, so a literal split counts the same
  // non-overlapping occurrences a global regex match would.
  return words.map((word) => (haystack.split(word).length - 1) / text.length);
};
2024-12-11 19:46:11 -03:00
/**
 * Ranks links by the cosine similarity between the embedding of the search
 * query and the embedding of each link's context string.
 *
 * @param linksWithContext - Context string for each link; each one is embedded.
 * @param links - Links, matched to `linksWithContext` by index.
 * @param searchQuery - Query the links are ranked against.
 * @returns Objects `{ link, linkWithContext, score, originalIndex }` sorted by
 *   descending `score`, ties kept in original order. Returns `[]` when the
 *   query is empty (also after null characters are stripped), when either
 *   array is empty, or when an error escapes (it is logged via
 *   `console.error`).
 */
async function performRanking(
  linksWithContext: string[],
  links: string[],
  searchQuery: string,
) {
  try {
    // Handle invalid inputs
    if (!searchQuery || !linksWithContext.length || !links.length) {
      return [];
    }

    // Sanitize search query by removing null characters
    const sanitizedQuery = searchQuery.split("\0").join("");
    if (!sanitizedQuery) {
      return [];
    }

    // Generate embeddings for the search query
    const queryEmbedding = await getEmbedding(sanitizedQuery);

    // Generate embeddings for each link and calculate similarity in parallel
    const linksAndScores = await Promise.all(
      linksWithContext.map(async (linkWithContext, index) => {
        let score = 0;
        try {
          const linkEmbedding = await getEmbedding(linkWithContext);
          score = cosineSimilarity(queryEmbedding, linkEmbedding);
        } catch {
          // Best-effort: a link whose embedding fails keeps score 0 so one
          // failure does not discard the whole ranking.
        }
        return {
          link: links[index],
          linkWithContext,
          score,
          originalIndex: index,
        };
      }),
    );

    // Sort links based on similarity scores while preserving original order for equal scores
    linksAndScores.sort((a, b) => {
      const scoreDiff = b.score - a.score;
      return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
    });

    return linksAndScores;
  } catch (error) {
    console.error(`Error performing semantic search: ${error}`);
    return [];
  }
}
export { performRanking };