feat: Update investor report generation and HTML template to include fund details and improve data handling

This commit is contained in:
bolade
2025-10-21 10:48:58 +01:00
parent 63d8e57e57
commit 483c2cc114
10 changed files with 289 additions and 135 deletions
Binary file not shown.
+5 -3
View File
@@ -61,16 +61,18 @@ async def parse_csv(
- Handles AUM, fund sizes, and check sizes as integers - Handles AUM, fund sizes, and check sizes as integers
**For companies:** **For companies:**
- Expected columns: Name, Website, Investor, Final Investor Profile (company profile) - Expected columns: Name, Website, Perplexity Gap Output (or Final Investor Profile)
- 100% manual JSON parsing - no LLM needed - 100% manual JSON parsing - no LLM needed
- Extracts company details, executives, investors, and client categories - **Only extracts:** founded_year and key_executives
- Automatically links companies to investors in database - **Only updates companies already in the database** (syncs with existing records)
- Skips companies not found in the database
**Benefits:** **Benefits:**
- Fast processing (5-10s per record) - Fast processing (5-10s per record)
- Low cost (minimal or no LLM usage) - Low cost (minimal or no LLM usage)
- Accurate data extraction - Accurate data extraction
- Automatic database persistence - Automatic database persistence
- Safe: won't create duplicate companies
""" """
# Read uploaded CSV with pandas # Read uploaded CSV with pandas
content = await file.read() content = await file.read()
Binary file not shown.
+12 -15
View File
@@ -52,7 +52,6 @@ async def generate_investor_report(
"website": investor.website, "website": investor.website,
"headquarters": investor.headquarters, "headquarters": investor.headquarters,
"aum": investor.aum, "aum": investor.aum,
"geographic_focus": investor.geographic_focus,
"portfolio_highlights": investor.portfolio_highlights or [], "portfolio_highlights": investor.portfolio_highlights or [],
"investment_thesis": investor.investment_thesis or [], "investment_thesis": investor.investment_thesis or [],
"sectors": [sector.name for sector in investor.sectors], "sectors": [sector.name for sector in investor.sectors],
@@ -65,24 +64,22 @@ async def generate_investor_report(
} }
for member in investor.team_members for member in investor.team_members
], ],
"check_size_lower": None, "funds": [],
"check_size_upper": None,
"investment_stages": [],
} }
# Get check sizes and stages from funds # Get all funds with their data
if investor.funds: if investor.funds:
# Use the first fund's data or aggregate
fund = investor.funds[0]
investor_data["check_size_lower"] = fund.check_size_lower
investor_data["check_size_upper"] = fund.check_size_upper
# Aggregate all investment stages from all funds
stages = set()
for fund in investor.funds: for fund in investor.funds:
for stage in fund.investment_stages: fund_data = {
stages.add(stage.name) "fund_name": fund.fund_name,
investor_data["investment_stages"] = list(stages) "fund_size": fund.fund_size,
"check_size_lower": fund.check_size_lower,
"check_size_upper": fund.check_size_upper,
"geographic_focus": fund.geographic_focus,
"investment_stages": [stage.name for stage in fund.investment_stages],
"sectors": [sector.name for sector in fund.sectors],
}
investor_data["funds"].append(fund_data)
# Fetch project data if project_id is provided # Fetch project data if project_id is provided
project_data = None project_data = None
Binary file not shown.
Binary file not shown.
+97 -25
View File
@@ -145,16 +145,74 @@ Return the lower and upper bounds in USD."""
""" """
Manually parse the JSON profile from the CSV. Manually parse the JSON profile from the CSV.
Returns a cleaned dictionary with the investor profile data. Returns a cleaned dictionary with the investor profile data.
Handles JSON wrapped in markdown code blocks (```json ... ```).
Handles trailing quotes and extra data after JSON.
""" """
if not json_str or pd.isna(json_str): if not json_str or pd.isna(json_str):
return None return None
try: try:
# Clean the JSON string
cleaned_json = json_str.strip()
# Check if it's plain text (no JSON structure)
if not cleaned_json.startswith(("{", "```", "'")):
print(" ⚠️ No JSON structure found - skipping")
return None
# Remove markdown code block markers if present
if cleaned_json.startswith("```"):
# Remove opening marker (```json or ```Json or ```)
lines = cleaned_json.split("\n")
if lines[0].startswith("```"):
lines = lines[1:] # Remove first line
# Remove closing marker (``` or ```')
if lines and lines[-1].strip() in ("```", "```'", '```"'):
lines = lines[:-1] # Remove last line
cleaned_json = "\n".join(lines).strip()
# Remove trailing quotes that might be left over
if cleaned_json.endswith(("'", '"')):
cleaned_json = cleaned_json[:-1].strip()
# Try to find JSON boundaries if there's extra data
# Look for the first { and the last }
start_idx = cleaned_json.find("{")
if start_idx == -1:
print(" ⚠️ No opening brace found - not valid JSON")
return None
# Find the matching closing brace
# We need to count braces to find the actual end
brace_count = 0
end_idx = -1
for i in range(start_idx, len(cleaned_json)):
if cleaned_json[i] == "{":
brace_count += 1
elif cleaned_json[i] == "}":
brace_count -= 1
if brace_count == 0:
end_idx = i + 1
break
if end_idx == -1:
print(" ⚠️ No matching closing brace found")
return None
# Extract just the JSON part
cleaned_json = cleaned_json[start_idx:end_idx]
# Parse JSON string # Parse JSON string
profile = json.loads(json_str) profile = json.loads(cleaned_json)
return profile return profile
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
print(f"Error parsing JSON: {e}") print(f" ❌ JSON parsing error: {e}")
# Print first 200 chars for debugging
preview = json_str[:200] if len(json_str) > 200 else json_str
print(f" Preview: {preview}...")
return None
except Exception as e:
print(f" ❌ Unexpected error: {e}")
return None return None
async def process_investor_profile( async def process_investor_profile(
@@ -338,34 +396,45 @@ Return the lower and upper bounds in USD."""
if existing_company: if existing_company:
# Update only founded_year on existing company # Update founded_year and replace key executives on existing company
company = existing_company company = existing_company
updated_fields = []
if company_data.get("founded_year"): if company_data.get("founded_year"):
company.founded_year = company_data["founded_year"] company.founded_year = company_data["founded_year"]
updated_fields.append(
f"founded_year: {company_data['founded_year']}"
)
# Add/update company members (key executives)
# First, remove existing members if updating
db.query(CompanyMember).filter_by(company_id=company.id).delete()
exec_count = 0
for exec_data in company_data.get("key_executives", []):
member = CompanyMember(
name=exec_data.get("name"),
role=exec_data.get("title"),
linkedin=exec_data.get(
"source_url"
), # Store source URL in linkedin field
company_id=company.id,
)
db.add(member)
exec_count += 1
if exec_count > 0:
updated_fields.append(f"{exec_count} executives")
if updated_fields:
print(f" 📝 Updated: {', '.join(updated_fields)}")
return company
else: else:
# Company should already be in base database, but if not found, skip # Company not found in base database, skip
print( print(" ⚠️ Not in database - skipping")
f"⚠️ Company '{company_data['name']}' not found in base database - skipping"
)
return None return None
# Add/update company members (key executives)
# First, remove existing members if updating
db.query(CompanyMember).filter_by(company_id=company.id).delete()
for exec_data in company_data.get("key_executives", []):
member = CompanyMember(
name=exec_data.get("name"),
role=exec_data.get("title"),
linkedin=exec_data.get(
"source_url"
), # Store source URL in linkedin field
company_id=company.id,
)
db.add(member)
return company
except Exception as e: except Exception as e:
print(f"Error saving company to database: {e}") print(f"Error saving: {e}")
db.rollback() db.rollback()
return None return None
@@ -789,8 +858,11 @@ Return the lower and upper bounds in USD."""
if pd.notna(row.get("Investor")) if pd.notna(row.get("Investor"))
else None else None
) )
# Try both column names for flexibility
profile_json = ( profile_json = (
row.get("Final Investor Profile", "") row.get("Perplexity Gap Output", "")
if pd.notna(row.get("Perplexity Gap Output"))
else row.get("Final Investor Profile", "")
if pd.notna(row.get("Final Investor Profile")) if pd.notna(row.get("Final Investor Profile"))
else None else None
) )
+137 -61
View File
@@ -80,34 +80,70 @@ class ReportGenerator:
"thesis": 5, "thesis": 5,
} }
# Aggregate data from all funds
all_sectors = set(investor_data.get("sectors", []))
all_stages = set()
all_geographies = []
check_ranges = []
for fund in investor_data.get("funds", []):
all_sectors.update(fund.get("sectors", []))
all_stages.update(fund.get("investment_stages", []))
if fund.get("geographic_focus"):
all_geographies.append(fund["geographic_focus"])
if fund.get("check_size_lower") and fund.get("check_size_upper"):
check_ranges.append(
{
"lower": fund["check_size_lower"],
"upper": fund["check_size_upper"],
}
)
# Sector match # Sector match
investor_sectors = set(investor_data.get("sectors", []))
project_sectors = set(project_data.get("sectors", [])) project_sectors = set(project_data.get("sectors", []))
if investor_sectors and project_sectors: if all_sectors and project_sectors:
if investor_sectors & project_sectors: if all_sectors & project_sectors:
score += weights["sector"] score += weights["sector"]
# Stage match # Stage match - case insensitive comparison
investor_stages = set(investor_data.get("investment_stages", []))
project_stage = project_data.get("stage") project_stage = project_data.get("stage")
if project_stage and project_stage in investor_stages: if project_stage and all_stages:
score += weights["stage"] # Normalize stage names for comparison (case-insensitive)
normalized_stages = {
stage.lower().replace("_", " ") for stage in all_stages
}
project_stage_normalized = project_stage.lower().replace("_", " ")
if project_stage_normalized in normalized_stages:
score += weights["stage"]
# Geography match # Geography match - check if any fund matches
investor_geo = (investor_data.get("geographic_focus") or "").lower()
project_geo = (project_data.get("location") or "").lower() project_geo = (project_data.get("location") or "").lower()
if investor_geo and project_geo and investor_geo in project_geo: geo_match = False
if all_geographies:
for geo in all_geographies:
if geo:
geo_lower = geo.lower()
# Match if investor geography is "global" or if there's a location overlap
if "global" in geo_lower or "worldwide" in geo_lower:
geo_match = True
break
if project_geo and (
geo_lower in project_geo or project_geo in geo_lower
):
geo_match = True
break
if geo_match:
score += weights["geography"] score += weights["geography"]
# Check size match # Check size match - check if any fund's range matches
project_valuation = project_data.get("valuation", 0) project_valuation = project_data.get("valuation", 0)
check_lower = investor_data.get("check_size_lower") or 0 check_match = False
check_upper = investor_data.get("check_size_upper") or float("inf") if project_valuation and check_ranges:
if ( for check_range in check_ranges:
check_lower if check_range["lower"] <= project_valuation <= check_range["upper"]:
and check_upper check_match = True
and check_lower <= project_valuation <= check_upper break
): if check_match:
score += weights["check_size"] score += weights["check_size"]
# Thesis alignment (simplified) # Thesis alignment (simplified)
@@ -121,86 +157,126 @@ class ReportGenerator:
"""Generate detailed match criteria table""" """Generate detailed match criteria table"""
criteria = [] criteria = []
# Aggregate data from all funds
all_sectors = set(investor_data.get("sectors", []))
all_stages = set()
all_geographies = []
check_ranges = []
for fund in investor_data.get("funds", []):
all_sectors.update(fund.get("sectors", []))
all_stages.update(fund.get("investment_stages", []))
if fund.get("geographic_focus"):
all_geographies.append(fund["geographic_focus"])
if fund.get("check_size_lower") and fund.get("check_size_upper"):
check_ranges.append(
{
"lower": fund["check_size_lower"],
"upper": fund["check_size_upper"],
"fund_name": fund.get("fund_name", "Unnamed Fund"),
}
)
# Sector criterion # Sector criterion
investor_sectors = investor_data.get("sectors", [])
project_sectors = project_data.get("sectors", []) project_sectors = project_data.get("sectors", [])
sector_match = ( sector_match = "Perfect" if all_sectors & set(project_sectors) else "Mismatch"
"Perfect" if set(investor_sectors) & set(project_sectors) else "Mismatch"
)
criteria.append( criteria.append(
{ {
"name": "Sector", "name": "Sector",
"requirement": "Cybersecurity, B2B SaaS" if project_sectors else "N/A", "requirement": ", ".join(project_sectors) if project_sectors else "N/A",
"evidence": ", ".join(investor_sectors[:3]) "evidence": ", ".join(list(all_sectors)[:3]) if all_sectors else "N/A",
if investor_sectors
else "N/A",
"match": sector_match, "match": sector_match,
"weight": "30%", "weight": "30%",
} }
) )
# Stage criterion # Stage criterion - case insensitive comparison
investor_stages = investor_data.get("investment_stages", [])
project_stage = project_data.get("stage", "N/A") project_stage = project_data.get("stage", "N/A")
stage_match = "Perfect" if project_stage in investor_stages else "Mismatch" stage_match = "Mismatch"
if project_stage != "N/A" and all_stages:
# Normalize stage names for comparison
normalized_stages = {
stage.lower().replace("_", " ") for stage in all_stages
}
project_stage_normalized = project_stage.lower().replace("_", " ")
stage_match = (
"Perfect"
if project_stage_normalized in normalized_stages
else "Mismatch"
)
elif project_stage == "N/A":
stage_match = "N/A"
criteria.append( criteria.append(
{ {
"name": "Stage", "name": "Stage",
"requirement": str(project_stage), "requirement": str(project_stage),
"evidence": ", ".join(investor_stages) if investor_stages else "N/A", "evidence": ", ".join(all_stages) if all_stages else "N/A",
"match": stage_match, "match": stage_match,
"weight": "30%", "weight": "30%",
} }
) )
# Geography criterion # Geography criterion
investor_geo = investor_data.get("geographic_focus") or "N/A"
project_geo = project_data.get("location") or "N/A" project_geo = project_data.get("location") or "N/A"
investor_geo_display = ", ".join(all_geographies) if all_geographies else "N/A"
# Safe comparison handling None values and "Global" matches
geo_match = "Mismatch"
if project_geo != "N/A" and all_geographies:
for geo in all_geographies:
if geo:
geo_lower = geo.lower()
# Match if investor geography is "global" or if there's a location overlap
if "global" in geo_lower or "worldwide" in geo_lower:
geo_match = "Perfect"
break
if (
geo_lower in project_geo.lower()
or project_geo.lower() in geo_lower
):
geo_match = "Strong"
break
elif not all_geographies and project_geo == "N/A":
geo_match = "N/A"
# Safe comparison handling None values
if investor_geo == "N/A" or project_geo == "N/A":
geo_match = (
"N/A" if investor_geo == "N/A" and project_geo == "N/A" else "Mismatch"
)
else:
investor_geo_lower = investor_geo.lower()
project_geo_lower = project_geo.lower()
geo_match = (
"Strong"
if investor_geo_lower in project_geo_lower
or project_geo_lower in investor_geo_lower
else "Mismatch"
)
criteria.append( criteria.append(
{ {
"name": "Geography", "name": "Geography",
"requirement": project_geo, "requirement": project_geo,
"evidence": investor_geo, "evidence": investor_geo_display,
"match": geo_match, "match": geo_match,
"weight": "20%", "weight": "20%",
} }
) )
# Check Size criterion # Check Size criterion
check_lower = investor_data.get("check_size_lower") or 0
check_upper = investor_data.get("check_size_upper") or 0
project_val = project_data.get("valuation", 0) project_val = project_data.get("valuation", 0)
# Build evidence string from all fund ranges
check_evidence = "N/A" check_evidence = "N/A"
if check_lower and check_upper: if check_ranges:
check_evidence = ( evidence_parts = []
f"{check_lower / 1000000:.0f}M - €{check_upper / 1000000:.0f}M" for cr in check_ranges[:3]: # Show up to 3 funds
) range_str = (
elif check_lower: f"{cr['lower'] / 1000000:.0f}M - €{cr['upper'] / 1000000:.0f}M"
check_evidence = f"{check_lower / 1000000:.0f}M+" )
if cr["fund_name"]:
evidence_parts.append(f"{cr['fund_name']}: {range_str}")
else:
evidence_parts.append(range_str)
check_evidence = "; ".join(evidence_parts)
# Check if project valuation matches any fund
check_match = "N/A"
if project_val > 0 and check_ranges:
match_found = any(
cr["lower"] <= project_val <= cr["upper"] for cr in check_ranges
)
check_match = "Perfect" if match_found else "Mismatch"
elif project_val > 0:
check_match = "Strong"
check_match = (
"Perfect"
if check_lower and check_upper and check_lower <= project_val <= check_upper
else "Strong"
if project_val > 0
else "N/A"
)
criteria.append( criteria.append(
{ {
"name": "Check Size", "name": "Check Size",
+38 -31
View File
@@ -161,13 +161,6 @@
</p> </p>
</div> </div>
<div>
<p class="text-xs text-gray-600">DACH Region:</p>
<p class="font-semibold text-gray-900">
{{ investor.geographic_focus or 'N/A' }}
</p>
</div>
<div> <div>
<p class="text-xs text-gray-600">AUM (EUR million):</p> <p class="text-xs text-gray-600">AUM (EUR million):</p>
<p class="font-semibold text-gray-900"> <p class="font-semibold text-gray-900">
@@ -179,33 +172,47 @@
</p> </p>
</div> </div>
<div class="mb-4"> <div>
<p class="text-xs text-gray-600 mb-1"> <p class="text-xs text-gray-600 mb-1">Number of Funds:</p>
Investment Stage: <p class="font-semibold text-gray-900">
</p> {{ investor.funds | length if investor.funds else 'N/A' }}
<p class="text-sm font-semibold text-gray-900">
{% if investor.investment_stages %} {{
investor.investment_stages | join(', ') }} {% else
%} N/A {% endif %}
</p>
</div>
<div class="mb-4">
<p class="text-xs text-gray-600 mb-1">
Est. Investment Size:
</p>
<p class="text-sm font-semibold text-gray-900">
{% if investor.check_size_lower and
investor.check_size_upper %} €{{
'{:,.0f}'.format(investor.check_size_lower /
1000000) }}M - €{{
'{:,.0f}'.format(investor.check_size_upper /
1000000) }}M {% elif investor.check_size_lower %}
€{{ '{:,.0f}'.format(investor.check_size_lower /
1000000) }}M+ {% else %} N/A {% endif %}
</p> </p>
</div> </div>
</div> </div>
<div class="mt-4">
<h3 class="text-xs font-bold text-gray-900 uppercase mb-2">
Fund Details
</h3>
{% if investor.funds %}
{% for fund in investor.funds %}
<div class="mb-3 pb-3 border-b border-gray-200">
<p class="text-sm font-semibold text-gray-900 mb-1">
{{ fund.fund_name or 'Fund ' + loop.index|string }}
</p>
<div class="text-xs text-gray-700 space-y-1">
{% if fund.fund_size %}
<p>Fund Size: €{{ '{:,.0f}'.format(fund.fund_size / 1000000) }}M</p>
{% endif %}
{% if fund.check_size_lower and fund.check_size_upper %}
<p>Check Size: €{{ '{:,.0f}'.format(fund.check_size_lower / 1000000) }}M - €{{ '{:,.0f}'.format(fund.check_size_upper / 1000000) }}M</p>
{% endif %}
{% if fund.geographic_focus %}
<p>Geography: {{ fund.geographic_focus }}</p>
{% endif %}
{% if fund.investment_stages %}
<p>Stages: {{ fund.investment_stages | join(', ') }}</p>
{% endif %}
{% if fund.sectors %}
<p>Sectors: {{ fund.sectors[:3] | join(', ') }}</p>
{% endif %}
</div>
</div>
{% endfor %}
{% else %}
<p class="text-xs text-gray-500">No fund information available</p>
{% endif %}
</div>
</div> </div>
</div> </div>
BIN
View File
Binary file not shown.