Add test script for manual JSON parser with LLM currency conversion

- Implemented a new test script `test_parser.py` to validate the functionality of the manual JSON parser.
- The script loads investor data from a CSV file and processes a sample of three investors.
- Results include detailed information about each investor, their funds, team members, and investment thesis.
- Added error handling for a missing API key in the environment variables.
2025-10-06 14:07:28 +01:00
parent c199f5423a
commit cd7172ed9f
11 changed files with 31090 additions and 49 deletions
@@ -44,6 +44,18 @@ def health():
 async def parse_csv(
    db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...)
 ):
+    """
+    Parse and import CSV data into the database.
+
+    For investors: Expected columns - Name, Website, Final Investor Profile, Final Profile sourcing
+    For companies: Uses legacy LLM-based parsing
+
+    The new investor parser:
+    - Manually parses JSON profiles for efficiency
+    - Uses LLM only for currency conversion to USD
+    - Handles AUM, fund sizes, and check sizes as integers
+    - Automatically saves to database
+    """
    # Read uploaded CSV with pandas
    content = await file.read()
    df = pd.read_csv(io.StringIO(content.decode("utf-8")))
@@ -52,12 +64,15 @@ async def parse_csv(
    processor = InvestorProcessor()

    if is_investor == 1:
-        results = await processor.parse_investors(df)
+        # New manual parser with LLM currency conversion
+        results = await processor.parse_investors(df, save_to_db=True)
+        # Results are already dicts from the new parser
+        return results
    else:
-        results = await processor.parse_companies(df)
-
-    # Convert Pydantic objects to dictionaries
-    return [r.model_dump() for r in results]
+        # Legacy LLM-based company parser
+        results = await processor.parse_companies(df, save_to_db=True)
+        # Convert Pydantic objects to dictionaries
+        return [r.model_dump() if hasattr(r, "model_dump") else r for r in results]


@app.post("/query", response_model=InvestorList, tags=["Querying"])