Files
firecrawl/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts
T

141 lines
4.7 KiB
TypeScript
Raw Normal View History

2024-12-16 09:30:40 -03:00
import { removeDefaultProperty } from "./llmExtract";
import { truncateText } from "./llmExtract";
import { encoding_for_model } from "@dqbd/tiktoken";
// Swap the tokenizer package for a stub whose encoding_for_model is a bare
// jest.fn(), so no real tokenizer is loaded while these tests run.
jest.mock("@dqbd/tiktoken", () => {
  return { encoding_for_model: jest.fn() };
});
2024-12-16 09:30:40 -03:00
// Tests for removeDefaultProperty: keys named "default" are expected to be
// dropped from objects (including nested objects and objects inside arrays),
// while all other data and non-object inputs come back unchanged.
describe("removeDefaultProperty", () => {
  it("should remove the default property from a simple object", () => {
    const input = { default: "test", test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from a nested object", () => {
    // "default" appears both at the top level and one level down.
    const input = {
      default: "test",
      nested: { default: "nestedTest", test: "nestedTest" },
    };
    const expectedOutput = { nested: { test: "nestedTest" } };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from an array of objects", () => {
    const input = {
      array: [
        { default: "test1", test: "test1" },
        { default: "test2", test: "test2" },
      ],
    };
    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle objects without a default property", () => {
    const input = { test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle null and non-object inputs", () => {
    // Primitives and null must be returned as-is rather than throwing.
    expect(removeDefaultProperty(null)).toBeNull();
    expect(removeDefaultProperty("string")).toBe("string");
    expect(removeDefaultProperty(123)).toBe(123);
  });
});
// Tests for truncateText. The tokenizer is stubbed: `mockEncode` returns arrays
// whose length stands in for the token count, so each test controls how many
// "tokens" the text appears to contain.
describe("truncateText", () => {
  const mockEncode = jest.fn();
  const mockEncoder = {
    encode: mockEncode,
  };

  beforeEach(() => {
    // Clear recorded calls, then make the mocked module hand out the stub
    // encoder again. Each test configures mockEncode's return value itself.
    jest.clearAllMocks();
    (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
  });

  it("should return the original text if it's within token limit", () => {
    const text = "This is a short text";
    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens

    const result = truncateText(text, 10);

    expect(result).toBe(text);
    expect(mockEncode).toHaveBeenCalledWith(text);
  });

  it("should truncate text that exceeds token limit", () => {
    const text = "This is a longer text that needs truncation";
    mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens

    const result = truncateText(text, 10);

    expect(result.length).toBeLessThan(text.length);
    expect(mockEncode).toHaveBeenCalled();
  });

  it("should handle empty string", () => {
    const text = "";
    mockEncode.mockReturnValue([]);

    const result = truncateText(text, 10);

    expect(result).toBe("");
    expect(mockEncode).toHaveBeenCalledWith("");
  });

  it("should use character-based fallback when encoder throws error", () => {
    const text = "This is some text";
    mockEncode.mockImplementation(() => {
      throw new Error("Encoder error");
    });

    const result = truncateText(text, 5);

    // With modifier of 3, should truncate to approximately 15 characters
    expect(result.length).toBeLessThanOrEqual(15);
  });

  it("should handle very short max token limits", () => {
    const text = "Short text";
    mockEncode.mockReturnValue(new Array(10));

    const result = truncateText(text, 1);

    expect(result.length).toBeLessThan(text.length);
  });

  it("should handle zero max tokens", () => {
    const text = "Some text";
    mockEncode.mockReturnValue(new Array(2));

    const result = truncateText(text, 0);

    expect(result).toBe("");
  });

  it("should handle extremely large text exceeding model context", () => {
    // Create a very large text (e.g., 100,000 characters)
    const text = "a".repeat(100000);

    // First call reports 25000 tokens; the following calls report gradually
    // decreasing counts to simulate the iterative truncation process.
    mockEncode
      .mockReturnValueOnce(new Array(25000))
      .mockReturnValueOnce(new Array(20000))
      .mockReturnValueOnce(new Array(15000))
      .mockReturnValueOnce(new Array(12000))
      .mockReturnValueOnce(new Array(9000));

    const result = truncateText(text, 10000); // Common model context limit

    // The result should be significantly shorter but not empty
    expect(result.length).toBeLessThan(text.length);
    expect(result.length).toBeGreaterThan(0);
    // Given the conservative truncation approach, a substantial amount of text
    // should remain: at least 30% of the original.
    expect(result.length).toBeGreaterThan(30000);
    expect(mockEncode).toHaveBeenCalled();
  });
});