firecrawl/apps/api/src/scraper/scrapeURL/transformers/llmExtract.test.ts

import { removeDefaultProperty } from "./llmExtract";
import { truncateText } from "./llmExtract";
import { encoding_for_model } from "@dqbd/tiktoken";

jest.mock("@dqbd/tiktoken", () => ({
  encoding_for_model: jest.fn(),
}));

describe("removeDefaultProperty", () => {
  it("should remove the default property from a simple object", () => {
    const input = { default: "test", test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from a nested object", () => {
    const input = {
      default: "test",
      nested: { default: "nestedTest", test: "nestedTest" },
    };
    const expectedOutput = { nested: { test: "nestedTest" } };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from an array of objects", () => {
    const input = {
      array: [
        { default: "test1", test: "test1" },
        { default: "test2", test: "test2" },
      ],
    };
    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle objects without a default property", () => {
    const input = { test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle null and non-object inputs", () => {
    expect(removeDefaultProperty(null)).toBeNull();
    expect(removeDefaultProperty("string")).toBe("string");
    expect(removeDefaultProperty(123)).toBe(123);
  });
});

describe("truncateText", () => {
  const mockEncode = jest.fn();
  const mockEncoder = {
    encode: mockEncode,
  };

  beforeEach(() => {
    jest.clearAllMocks();
    (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
  });

  it("should return the original text if it's within token limit", () => {
    const text = "This is a short text";
    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens

    const result = truncateText(text, 10);
    expect(result).toBe(text);
    expect(mockEncode).toHaveBeenCalledWith(text);
  });

  it("should truncate text that exceeds token limit", () => {
    const text = "This is a longer text that needs truncation";
    mockEncode.mockReturnValue(new Array(20)); // Simulate 20 tokens

    const result = truncateText(text, 10);
    expect(result.length).toBeLessThan(text.length);
    expect(mockEncode).toHaveBeenCalled();
  });

  it("should handle empty string", () => {
    const text = "";
    mockEncode.mockReturnValue([]);

    const result = truncateText(text, 10);
    expect(result).toBe("");
    expect(mockEncode).toHaveBeenCalledWith("");
  });

  it("should use character-based fallback when encoder throws error", () => {
    const text = "This is some text";
    mockEncode.mockImplementation(() => {
      throw new Error("Encoder error");
    });

    const result = truncateText(text, 5);
    // With modifier of 3, should truncate to approximately 15 characters
    expect(result.length).toBeLessThanOrEqual(15);
  });

  it("should handle very short max token limits", () => {
    const text = "Short text";
    mockEncode.mockReturnValue(new Array(10));

    const result = truncateText(text, 1);
    expect(result.length).toBeLessThan(text.length);
  });

  it("should handle zero max tokens", () => {
    const text = "Some text";
    mockEncode.mockReturnValue(new Array(2));

    const result = truncateText(text, 0);
    expect(result).toBe("");
  });

  it("should handle extremely large text exceeding model context", () => {
    // Create a very large text (e.g., 100,000 characters)
    const text = "a".repeat(100000);

    // First call: simulate 25000 tokens
    mockEncode.mockReturnValueOnce(new Array(25000));
    // Subsequent calls: simulate gradually decreasing token counts
    // This simulates the iterative truncation process
    mockEncode
      .mockReturnValueOnce(new Array(20000))
      .mockReturnValueOnce(new Array(15000))
      .mockReturnValueOnce(new Array(12000))
      .mockReturnValueOnce(new Array(9000));

    const result = truncateText(text, 10000); // Common model context limit

    // The result should be significantly shorter but not empty
    expect(result.length).toBeLessThan(text.length);
    expect(result.length).toBeGreaterThan(0);
    // Given our new conservative approach, we should have a substantial amount of text
    expect(result.length).toBeGreaterThan(30000); // At least 30% of original
    expect(mockEncode).toHaveBeenCalled();

    // Log the actual length for verification
    console.log("Result length:", result.length, "characters");
  });
});