Implement manual JSON parsing for company profiles; enhance data extraction and processing efficiency; add comprehensive test script for validation

This commit is contained in:
bolade
2025-10-07 12:07:43 +01:00
parent 1f3f08e80d
commit c0fbbdd917
5 changed files with 795 additions and 62 deletions
+18 -9
View File
@@ -47,14 +47,23 @@ async def parse_csv(
"""
Parse and import CSV data into the database.
For investors: Expected columns - Name, Website, Final Investor Profile, Final Profile sourcing
For companies: Uses legacy LLM-based parsing
The new investor parser:
**For investors:**
- Expected columns: Name, Website, Final Investor Profile, Final Profile sourcing
- Manually parses JSON profiles for efficiency
- Uses LLM only for currency conversion to USD
- Handles AUM, fund sizes, and check sizes as integers
- Automatically saves to database
**For companies:**
- Expected columns: Name, Website, Investor, Final Investor Profile (company profile)
- 100% manual JSON parsing - no LLM needed
- Extracts company details, executives, investors, and client categories
- Automatically links companies to investors in database
**Benefits:**
- Fast processing (5-10s per record)
- Low cost (minimal or no LLM usage)
- Accurate data extraction
- Automatic database persistence
"""
# Read uploaded CSV with pandas
content = await file.read()
@@ -64,15 +73,15 @@ async def parse_csv(
processor = InvestorProcessor()
if is_investor == 1:
# New manual parser with LLM currency conversion
# Manual parser with LLM currency conversion
results = await processor.parse_investors(df, save_to_db=True)
# Results are already dicts from the new parser
return results
else:
# Legacy LLM-based company parser
# Manual parser for companies (no LLM needed)
results = await processor.parse_companies(df, save_to_db=True)
# Convert Pydantic objects to dictionaries
return [r.model_dump() if hasattr(r, "model_dump") else r for r in results]
# Results are already dicts from the new parser
return results
@app.post("/query", response_model=InvestorList, tags=["Querying"])