feat: Enhance investor and company parsing with asynchronous batch processing

This commit is contained in:
bolade
2025-10-08 13:29:25 +01:00
parent be6fde9ba2
commit 58722f1102
3 changed files with 168 additions and 126 deletions
Binary file not shown.
+155 -113
View File
@@ -1,3 +1,4 @@
import asyncio
import json import json
import os import os
import re import re
@@ -649,31 +650,14 @@ Return the lower and upper bounds in USD."""
print(f"Error processing row {row_idx + 1}: {e}") print(f"Error processing row {row_idx + 1}: {e}")
return None return None
async def parse_investors(self, df: pd.DataFrame, save_to_db: bool = True): async def _process_single_investor(
""" self, idx: int, row: pd.Series, total_rows: int
Parse investors from DataFrame using manual JSON parsing and LLM for currency conversion. ) -> Optional[dict]:
Expected CSV columns: Name, Website, Final Investor Profile, Final Profile sourcing """Process a single investor row"""
"""
results = []
db = None
if save_to_db:
db = get_db_session()
try: try:
total_rows = len(df) name = row.get("Name", "").strip() if pd.notna(row.get("Name")) else None
print(f"\n🚀 Starting to process {total_rows} investors...")
for idx, row in df.iterrows():
try:
name = (
row.get("Name", "").strip()
if pd.notna(row.get("Name"))
else None
)
website = ( website = (
row.get("Website", "").strip() row.get("Website", "").strip() if pd.notna(row.get("Website")) else None
if pd.notna(row.get("Website"))
else None
) )
profile_json = ( profile_json = (
row.get("Final Investor Profile", "") row.get("Final Investor Profile", "")
@@ -683,9 +667,9 @@ Return the lower and upper bounds in USD."""
if not name or not profile_json: if not name or not profile_json:
print(f"⚠️ Row {idx + 1}: Skipping - missing name or profile") print(f"⚠️ Row {idx + 1}: Skipping - missing name or profile")
continue return None
print(f"\n📊 Processing {idx + 1}/{total_rows}: {name}") print(f"📊 Processing {idx + 1}/{total_rows}: {name}")
# Process the investor profile # Process the investor profile
investor_data = await self.process_investor_profile( investor_data = await self.process_investor_profile(
@@ -693,18 +677,61 @@ Return the lower and upper bounds in USD."""
) )
if investor_data: if investor_data:
print(f"{name} parsed successfully")
return investor_data
else:
print(f" ⚠️ {name} failed to process")
return None
except Exception as e:
print(f"❌ Error processing row {idx + 1}: {e}")
return None
async def parse_investors(
self, df: pd.DataFrame, save_to_db: bool = True, batch_size: int = 10
):
"""
Parse investors from DataFrame using manual JSON parsing and LLM for currency conversion.
Processes multiple investors concurrently for better performance.
Expected CSV columns: Name, Website, Final Investor Profile, Final Profile sourcing
Args:
df: DataFrame with investor data
save_to_db: Whether to save to database
batch_size: Number of investors to process concurrently (default: 10)
"""
results = []
db = None
if save_to_db:
db = get_db_session()
try:
total_rows = len(df)
print(
f"\n🚀 Starting to process {total_rows} investors with batch size {batch_size}..."
)
# Process in batches
for batch_start in range(0, total_rows, batch_size):
batch_end = min(batch_start + batch_size, total_rows)
print(
f"\n🔄 Processing batch {batch_start + 1}-{batch_end} of {total_rows}..."
)
# Create tasks for concurrent processing
tasks = []
for idx in range(batch_start, batch_end):
row = df.iloc[idx]
task = self._process_single_investor(idx, row, total_rows)
tasks.append(task)
# Process batch concurrently
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
# Filter out None results and exceptions, then save to database
for investor_data in batch_results:
if investor_data and not isinstance(investor_data, Exception):
results.append(investor_data) results.append(investor_data)
print(" ✓ Parsed successfully")
print(f" - HQ: {investor_data.get('headquarters')}")
print(
f" - AUM: ${investor_data.get('aum'):,}"
if investor_data.get("aum")
else " - AUM: Not Available"
)
print(f" - Funds: {len(investor_data.get('funds', []))}")
print(
f" - Team: {len(investor_data.get('team_members', []))}"
)
# Save to database # Save to database
if save_to_db and db: if save_to_db and db:
@@ -713,33 +740,29 @@ Return the lower and upper bounds in USD."""
db, investor_data db, investor_data
) )
if saved_investor: if saved_investor:
db.commit()
print( print(
f" ✅ Saved to database (ID: {saved_investor.id})" f" ✅ Saved {investor_data['name']} to database (ID: {saved_investor.id})"
) )
else: else:
print(" ❌ Failed to save to database") print(
f" ❌ Failed to save {investor_data['name']} to database"
)
except Exception as e: except Exception as e:
db.rollback() db.rollback()
print(f" ❌ Database error: {e}") print(
else: f" ❌ Database error for {investor_data['name']}: {e}"
print(" ⚠️ Failed to process profile") )
elif isinstance(investor_data, Exception):
print(f" ❌ Exception occurred: {investor_data}")
# Commit every 10 investors to avoid memory issues # Commit batch to database
if save_to_db and db and (idx + 1) % 10 == 0:
db.commit()
print(f"\n💾 Committed batch at row {idx + 1}")
except Exception as e:
print(f"❌ Error processing row {idx + 1}: {e}")
if db:
db.rollback()
continue
# Final commit
if save_to_db and db: if save_to_db and db:
try:
db.commit() db.commit()
print("\n✅ Final commit completed") print(f"💾 Committed batch {batch_start + 1}-{batch_end}")
except Exception as e:
db.rollback()
print(f"❌ Failed to commit batch: {e}")
except Exception as e: except Exception as e:
print(f"❌ Fatal error in parse_investors: {e}") print(f"❌ Fatal error in parse_investors: {e}")
@@ -752,31 +775,14 @@ Return the lower and upper bounds in USD."""
print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} investors") print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} investors")
return results return results
async def parse_companies(self, df: pd.DataFrame, save_to_db: bool = True): async def _process_single_company(
""" self, idx: int, row: pd.Series, total_rows: int
Parse companies from DataFrame using manual JSON parsing. ) -> Optional[dict]:
Expected CSV columns: Name, Website, Investor, Final Investor Profile (actually company profile) """Process a single company row"""
"""
results = []
db = None
if save_to_db:
db = get_db_session()
try: try:
total_rows = len(df) name = row.get("Name", "").strip() if pd.notna(row.get("Name")) else None
print(f"\n🚀 Starting to process {total_rows} companies...")
for idx, row in df.iterrows():
try:
name = (
row.get("Name", "").strip()
if pd.notna(row.get("Name"))
else None
)
website = ( website = (
row.get("Website", "").strip() row.get("Website", "").strip() if pd.notna(row.get("Website")) else None
if pd.notna(row.get("Website"))
else None
) )
investor_names = ( investor_names = (
row.get("Investor", "").strip() row.get("Investor", "").strip()
@@ -791,9 +797,9 @@ Return the lower and upper bounds in USD."""
if not name or not profile_json: if not name or not profile_json:
print(f"⚠️ Row {idx + 1}: Skipping - missing name or profile") print(f"⚠️ Row {idx + 1}: Skipping - missing name or profile")
continue return None
print(f"\n📊 Processing {idx + 1}/{total_rows}: {name}") print(f"📊 Processing {idx + 1}/{total_rows}: {name}")
# Process the company profile # Process the company profile
company_data = await self.process_company_profile( company_data = await self.process_company_profile(
@@ -801,21 +807,61 @@ Return the lower and upper bounds in USD."""
) )
if company_data: if company_data:
print(f"{name} parsed successfully")
return company_data
else:
print(f" ⚠️ {name} failed to process")
return None
except Exception as e:
print(f"❌ Error processing row {idx + 1}: {e}")
return None
async def parse_companies(
self, df: pd.DataFrame, save_to_db: bool = True, batch_size: int = 10
):
"""
Parse companies from DataFrame using manual JSON parsing.
Processes multiple companies concurrently for better performance.
Expected CSV columns: Name, Website, Investor, Final Investor Profile (actually company profile)
Args:
df: DataFrame with company data
save_to_db: Whether to save to database
batch_size: Number of companies to process concurrently (default: 10)
"""
results = []
db = None
if save_to_db:
db = get_db_session()
try:
total_rows = len(df)
print(
f"\n🚀 Starting to process {total_rows} companies with batch size {batch_size}..."
)
# Process in batches
for batch_start in range(0, total_rows, batch_size):
batch_end = min(batch_start + batch_size, total_rows)
print(
f"\n🔄 Processing batch {batch_start + 1}-{batch_end} of {total_rows}..."
)
# Create tasks for concurrent processing
tasks = []
for idx in range(batch_start, batch_end):
row = df.iloc[idx]
task = self._process_single_company(idx, row, total_rows)
tasks.append(task)
# Process batch concurrently
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
# Filter out None results and exceptions, then save to database
for company_data in batch_results:
if company_data and not isinstance(company_data, Exception):
results.append(company_data) results.append(company_data)
print(" ✓ Parsed successfully")
print(f" - Location: {company_data.get('location')}")
print(f" - Industry: {company_data.get('industry')}")
print(
f" - Founded: {company_data.get('founded_year')}"
if company_data.get("founded_year")
else " - Founded: Unknown"
)
print(
f" - Executives: {len(company_data.get('key_executives', []))}"
)
print(
f" - Investors: {len(company_data.get('investor_names', []))}"
)
# Save to database # Save to database
if save_to_db and db: if save_to_db and db:
@@ -824,33 +870,29 @@ Return the lower and upper bounds in USD."""
db, company_data db, company_data
) )
if saved_company: if saved_company:
db.commit()
print( print(
f" ✅ Saved to database (ID: {saved_company.id})" f" ✅ Saved {company_data['name']} to database (ID: {saved_company.id})"
) )
else: else:
print(" ❌ Failed to save to database") print(
f" ❌ Failed to save {company_data['name']} to database"
)
except Exception as e: except Exception as e:
db.rollback() db.rollback()
print(f" ❌ Database error: {e}") print(
else: f" ❌ Database error for {company_data['name']}: {e}"
print(" ⚠️ Failed to process profile") )
elif isinstance(company_data, Exception):
print(f" ❌ Exception occurred: {company_data}")
# Commit every 10 companies to avoid memory issues # Commit batch to database
if save_to_db and db and (idx + 1) % 10 == 0:
db.commit()
print(f"\n💾 Committed batch at row {idx + 1}")
except Exception as e:
print(f"❌ Error processing row {idx + 1}: {e}")
if db:
db.rollback()
continue
# Final commit
if save_to_db and db: if save_to_db and db:
try:
db.commit() db.commit()
print("\n✅ Final commit completed") print(f"💾 Committed batch {batch_start + 1}-{batch_end}")
except Exception as e:
db.rollback()
print(f"❌ Failed to commit batch: {e}")
except Exception as e: except Exception as e:
print(f"❌ Fatal error in parse_companies: {e}") print(f"❌ Fatal error in parse_companies: {e}")
BIN
View File
Binary file not shown.