5c47e97db2
* Nick:
* Revert "fix(v1/types): fix extract -> json rename (FIR-1072) (#1195)"
This reverts commit 586a10f40d.
* Update deep-research-service.ts
* Nick:
* Nick:
* Nick:
* Nick:
* Nick:
* Nick:
* Update deep-research-service.ts
* Nick:
* Update deep-research-service.ts
* Apply suggestions from code review
---------
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
141 lines
4.7 KiB
TypeScript
141 lines
4.7 KiB
TypeScript
import { removeDefaultProperty } from "./llmExtract";
|
|
import { truncateText } from "./llmExtract";
|
|
import { encoding_for_model } from "@dqbd/tiktoken";
|
|
|
|
jest.mock("@dqbd/tiktoken", () => ({
|
|
encoding_for_model: jest.fn(),
|
|
}));
|
|
|
|
describe("removeDefaultProperty", () => {
|
|
it("should remove the default property from a simple object", () => {
|
|
const input = { default: "test", test: "test" };
|
|
const expectedOutput = { test: "test" };
|
|
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
|
|
});
|
|
|
|
it("should remove the default property from a nested object", () => {
|
|
const input = {
|
|
default: "test",
|
|
nested: { default: "nestedTest", test: "nestedTest" },
|
|
};
|
|
const expectedOutput = { nested: { test: "nestedTest" } };
|
|
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
|
|
});
|
|
|
|
it("should remove the default property from an array of objects", () => {
|
|
const input = {
|
|
array: [
|
|
{ default: "test1", test: "test1" },
|
|
{ default: "test2", test: "test2" },
|
|
],
|
|
};
|
|
const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
|
|
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
|
|
});
|
|
|
|
it("should handle objects without a default property", () => {
|
|
const input = { test: "test" };
|
|
const expectedOutput = { test: "test" };
|
|
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
|
|
});
|
|
|
|
it("should handle null and non-object inputs", () => {
|
|
expect(removeDefaultProperty(null)).toBeNull();
|
|
expect(removeDefaultProperty("string")).toBe("string");
|
|
expect(removeDefaultProperty(123)).toBe(123);
|
|
});
|
|
});
|
|
|
|
describe("truncateText", () => {
|
|
const mockEncode = jest.fn();
|
|
const mockEncoder = {
|
|
encode: mockEncode,
|
|
};
|
|
|
|
beforeEach(() => {
|
|
jest.clearAllMocks();
|
|
(encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
|
|
});
|
|
|
|
it("should return the original text if it's within token limit", () => {
|
|
const text = "This is a short text";
|
|
mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens
|
|
|
|
const result = truncateText(text, 10);
|
|
expect(result).toBe(text);
|
|
expect(mockEncode).toHaveBeenCalledWith(text);
|
|
});
|
|
|
|
it("should truncate text that exceeds token limit", () => {
|
|
const text = "This is a longer text that needs truncation";
|
|
mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens
|
|
|
|
const result = truncateText(text, 10);
|
|
expect(result.length).toBeLessThan(text.length);
|
|
expect(mockEncode).toHaveBeenCalled();
|
|
});
|
|
|
|
it("should handle empty string", () => {
|
|
const text = "";
|
|
mockEncode.mockReturnValue([]);
|
|
|
|
const result = truncateText(text, 10);
|
|
expect(result).toBe("");
|
|
expect(mockEncode).toHaveBeenCalledWith("");
|
|
});
|
|
|
|
it("should use character-based fallback when encoder throws error", () => {
|
|
const text = "This is some text";
|
|
mockEncode.mockImplementation(() => {
|
|
throw new Error("Encoder error");
|
|
});
|
|
|
|
const result = truncateText(text, 5);
|
|
// With modifier of 3, should truncate to approximately 15 characters
|
|
expect(result.length).toBeLessThanOrEqual(15);
|
|
});
|
|
|
|
it("should handle very short max token limits", () => {
|
|
const text = "Short text";
|
|
mockEncode.mockReturnValue(new Array(10));
|
|
|
|
const result = truncateText(text, 1);
|
|
expect(result.length).toBeLessThan(text.length);
|
|
});
|
|
|
|
it("should handle zero max tokens", () => {
|
|
const text = "Some text";
|
|
mockEncode.mockReturnValue(new Array(2));
|
|
|
|
const result = truncateText(text, 0);
|
|
expect(result).toBe("");
|
|
});
|
|
|
|
it("should handle extremely large text exceeding model context", () => {
|
|
// Create a very large text (e.g., 100,000 characters)
|
|
const text = "a".repeat(100000);
|
|
|
|
// First call: simulate 25000 tokens
|
|
mockEncode.mockReturnValueOnce(new Array(25000));
|
|
// Subsequent calls: simulate gradually decreasing token counts
|
|
// This simulates the iterative truncation process
|
|
mockEncode
|
|
.mockReturnValueOnce(new Array(20000))
|
|
.mockReturnValueOnce(new Array(15000))
|
|
.mockReturnValueOnce(new Array(12000))
|
|
.mockReturnValueOnce(new Array(9000));
|
|
|
|
const result = truncateText(text, 10000); // Common model context limit
|
|
|
|
// The result should be significantly shorter but not empty
|
|
expect(result.length).toBeLessThan(text.length);
|
|
expect(result.length).toBeGreaterThan(0);
|
|
// Given our new conservative approach, we should have a substantial amount of text
|
|
expect(result.length).toBeGreaterThan(30000); // At least 30% of original
|
|
expect(mockEncode).toHaveBeenCalled();
|
|
|
|
// Log the actual length for verification
|
|
console.log("Result length:", result.length, "characters");
|
|
});
|
|
});
|