feat: Simplify company profile processing to only extract founded_year and key_executives

This commit is contained in:
bolade
2025-10-08 13:20:08 +01:00
parent 37e1ad01c4
commit be6fde9ba2
3 changed files with 12 additions and 55 deletions
Binary file not shown.
+11 -54
View File
@@ -265,37 +265,20 @@ Return the lower and upper bounds in USD."""
) -> Optional[dict]: ) -> Optional[dict]:
""" """
Process company profile from CSV data. Process company profile from CSV data.
Manually extracts fields without using LLM. Only extracts founded_year and key_executives - rest is in base database.
""" """
profile = self.parse_json_profile(profile_json) profile = self.parse_json_profile(profile_json)
if not profile: if not profile:
return None return None
try: try:
# Extract basic info # Only extract founded_year and key_executives
company_data = { company_data = {
"name": name.strip() if name else None, "name": name.strip() if name else None,
"website": website.strip() if website else None, "founded_year": None,
"description": profile.get("companyDescription"),
"location": profile.get("geographicFocus"),
"industry": profile.get("sectorDescription"),
"founded_year": None, # Not typically in the company JSON
"key_executives": [], "key_executives": [],
"client_categories": profile.get("clientCategories", []),
"product_description": profile.get("productDescription"),
"linked_documents": profile.get("linkedDocuments", []),
"researcher_notes": profile.get("researcherNotes"),
"missing_important_fields": profile.get("missingImportantFields", []),
"sources": profile.get("sources", {}),
"investor_names": [],
} }
# Parse investor names from the Investor column
if investor_names and pd.notna(investor_names):
# Split by comma and clean
investors = [inv.strip() for inv in str(investor_names).split(",")]
company_data["investor_names"] = [inv for inv in investors if inv]
# Process key executives/leadership # Process key executives/leadership
key_executives = profile.get("keyExecutives", []) key_executives = profile.get("keyExecutives", [])
if not key_executives: if not key_executives:
@@ -313,7 +296,7 @@ Return the lower and upper bounds in USD."""
) )
# Try to extract founding year from description # Try to extract founding year from description
description = company_data.get("description", "") description = profile.get("companyDescription", "")
if description: if description:
# Look for patterns like "founded in 2020", "Gegründet 2020", "founded 2020" # Look for patterns like "founded in 2020", "Gegründet 2020", "founded 2020"
year_patterns = [ year_patterns = [
@@ -344,40 +327,27 @@ Return the lower and upper bounds in USD."""
def _save_parsed_company_to_db( def _save_parsed_company_to_db(
self, db: Session, company_data: dict self, db: Session, company_data: dict
) -> Optional[CompanyTable]: ) -> Optional[CompanyTable]:
"""Save manually parsed company data to database""" """Save manually parsed company data to database - only updates founded_year and key_executives"""
try: try:
# Check if company already exists # Check if company already exists (should exist in base database)
existing_company = ( existing_company = (
db.query(CompanyTable).filter_by(name=company_data["name"]).first() db.query(CompanyTable).filter_by(name=company_data["name"]).first()
) )
if existing_company: if existing_company:
# Update existing company # Update only founded_year on existing company
company = existing_company company = existing_company
company.website = company_data.get("website") or company.website
company.location = company_data.get("location") or company.location
company.description = (
company_data.get("description") or company.description
)
company.industry = company_data.get("industry") or company.industry
if company_data.get("founded_year"): if company_data.get("founded_year"):
company.founded_year = company_data["founded_year"] company.founded_year = company_data["founded_year"]
else: else:
# Create new company # Company should already be in base database, but if not found, skip
company = CompanyTable( print(
name=company_data["name"], f"⚠️ Company '{company_data['name']}' not found in base database - skipping"
website=company_data.get("website"),
location=company_data.get("location"),
description=company_data.get("description"),
industry=company_data.get("industry"),
founded_year=company_data.get("founded_year"),
) )
db.add(company) return None
db.flush()
# Add/update company members (key executives) # Add/update company members (key executives)
# First, remove existing members if updating # First, remove existing members if updating
if existing_company:
db.query(CompanyMember).filter_by(company_id=company.id).delete() db.query(CompanyMember).filter_by(company_id=company.id).delete()
for exec_data in company_data.get("key_executives", []): for exec_data in company_data.get("key_executives", []):
@@ -391,19 +361,6 @@ Return the lower and upper bounds in USD."""
) )
db.add(member) db.add(member)
# Link to investors if provided
for investor_name in company_data.get("investor_names", []):
# Find investor in database
investor = (
db.query(InvestorTable)
.filter_by(name=investor_name.strip())
.first()
)
if investor:
# Add company to investor's portfolio if not already there
if company not in investor.portfolio_companies:
investor.portfolio_companies.append(company)
return company return company
except Exception as e: except Exception as e:
BIN
View File
Binary file not shown.