Implement manual JSON parsing for company profiles; enhance data extraction and processing efficiency; add comprehensive test script for validation
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the company parser with manual JSON parsing.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/app")
|
||||
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
from services.llm_parser import InvestorProcessor
|
||||
|
||||
# Load environment variables from root directory
|
||||
load_dotenv("/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/.env")
|
||||
|
||||
# Sanity-check the environment: warn (but keep going) when the OpenRouter key
# is missing or empty, since the warning text itself notes that company
# parsing does not need it.
if os.environ.get("OPENROUTER_API_KEY", "") == "":
    print(
        "⚠️ WARNING: OPENROUTER_API_KEY not found in environment",
        "This is OK for companies (no LLM needed), but will fail for investors",
        sep="\n",
    )
|
||||
|
||||
|
||||
def _print_company_summary(position, company):
    """Print a human-readable summary of one parsed company record.

    Args:
        position: 1-based ordinal printed in front of the company name.
        company: Mapping for a single parsed company. It is read via ``.get``
            for the keys ``name``, ``website``, ``location``, ``industry``,
            ``founded_year``, ``key_executives``, ``investor_names`` and
            ``client_categories``.
    """
    # ``get(key, [])`` only falls back to [] when the key is *missing*; a key
    # that is present with a None value would reach ``len(None)`` and raise
    # TypeError. ``or []`` normalises both cases to an empty list.
    executives = company.get("key_executives") or []
    investors = company.get("investor_names") or []
    categories = company.get("client_categories") or []
    founded = company.get("founded_year")

    print(f"\n{position}. {company.get('name')}")
    print(f" Website: {company.get('website')}")
    print(f" Location: {company.get('location')}")
    print(f" Industry: {company.get('industry')}")
    print(f" Founded: {founded}" if founded else " Founded: Unknown")

    print(f" Executives: {len(executives)}")
    for member in executives[:3]:  # Show first 3
        print(f" - {member.get('name')} ({member.get('title')})")

    print(f" Investors: {len(investors)}")
    if investors:
        print(f" - {', '.join(investors[:5])}")  # Show first 5

    print(f" Client Categories: {len(categories)}")
    if categories:
        print(f" - {', '.join(categories[:3])}")  # Show first 3


async def test_parser(
    csv_path: str = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/data/300 Companies data.csv",
    sample_size: int = 3,
) -> None:
    """Test the new company parser with a small sample.

    Reads the company CSV, passes the first ``sample_size`` rows to
    ``InvestorProcessor.parse_companies`` with ``save_to_db=False`` and prints
    a summary of every returned record.

    Args:
        csv_path: Location of the company CSV. Defaults to the path this
            script originally hard-coded, so ``test_parser()`` is unchanged.
        sample_size: Number of leading rows to process (default 3).
    """
    print("🧪 Testing Manual Company JSON Parser (No LLM)\n")

    # Load the company data and keep only a small head sample for the test.
    df = pd.read_csv(csv_path)
    test_df = df.head(sample_size)

    processor = InvestorProcessor()

    print(f"Processing {len(test_df)} test companies...\n")
    results = await processor.parse_companies(test_df, save_to_db=False)

    banner = "=" * 80
    print("\n" + banner)
    print("📊 TEST RESULTS")
    print(banner)

    for idx, result in enumerate(results, 1):
        _print_company_summary(idx, result)

    print("\n" + banner)
    print(f"✅ Successfully processed {len(results)}/{len(test_df)} companies")
    print("🎉 No LLM calls needed - 100% manual parsing!")
    print(banner)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build the coroutine, then run it to completion.
    parser_check = test_parser()
    asyncio.run(parser_check)
|
||||
Reference in New Issue
Block a user