feat: Update investor report generation and HTML template to include fund details and improve data handling

2025-10-21 10:48:58 +01:00
parent 63d8e57e57
commit 483c2cc114
10 changed files with 289 additions and 135 deletions
@@ -61,16 +61,18 @@ async def parse_csv(
    - Handles AUM, fund sizes, and check sizes as integers
    **For companies:**
-    - Expected columns: Name, Website, Investor, Final Investor Profile (company profile)
+    - Expected columns: Name, Website, Perplexity Gap Output (or Final Investor Profile)
    - 100% manual JSON parsing - no LLM needed
-    - Extracts company details, executives, investors, and client categories
+    - **Only extracts:** founded_year and key_executives
-    - Automatically links companies to investors in database
+    - **Only updates companies already in the database** (syncs with existing records)
    - Skips companies not found in the database
    **Benefits:**
    - Fast processing (5-10s per record)
    - Low cost (minimal or no LLM usage)
    - Accurate data extraction
    - Automatic database persistence
    - Safe: won't create duplicate companies
    """
    # Read uploaded CSV with pandas
    content = await file.read()
@@ -52,7 +52,6 @@ async def generate_investor_report(
        "website": investor.website,
        "headquarters": investor.headquarters,
        "aum": investor.aum,
        "geographic_focus": investor.geographic_focus,
        "portfolio_highlights": investor.portfolio_highlights or [],
        "investment_thesis": investor.investment_thesis or [],
        "sectors": [sector.name for sector in investor.sectors],
@@ -65,24 +64,22 @@ async def generate_investor_report(
            }
            for member in investor.team_members
        ],
-        "check_size_lower": None,
+        "funds": [],
        "check_size_upper": None,
        "investment_stages": [],
    }
-    # Get check sizes and stages from funds
+    # Get all funds with their data
    if investor.funds:
        # Use the first fund's data or aggregate
        fund = investor.funds[0]
        investor_data["check_size_lower"] = fund.check_size_lower
        investor_data["check_size_upper"] = fund.check_size_upper
        # Aggregate all investment stages from all funds
        stages = set()
        for fund in investor.funds:
-            for stage in fund.investment_stages:
+            fund_data = {
-                stages.add(stage.name)
+                "fund_name": fund.fund_name,
-        investor_data["investment_stages"] = list(stages)
+                "fund_size": fund.fund_size,
                "check_size_lower": fund.check_size_lower,
                "check_size_upper": fund.check_size_upper,
                "geographic_focus": fund.geographic_focus,
                "investment_stages": [stage.name for stage in fund.investment_stages],
                "sectors": [sector.name for sector in fund.sectors],
            }
            investor_data["funds"].append(fund_data)
    # Fetch project data if project_id is provided
    project_data = None
@@ -145,16 +145,74 @@ Return the lower and upper bounds in USD."""
        """
        Manually parse the JSON profile from the CSV.
        Returns a cleaned dictionary with the investor profile data.
        Handles JSON wrapped in markdown code blocks (```json ... ```).
        Handles trailing quotes and extra data after JSON.
        """
        if not json_str or pd.isna(json_str):
            return None
        try:
            # Clean the JSON string
            cleaned_json = json_str.strip()
            # Check if it's plain text (no JSON structure)
            if not cleaned_json.startswith(("{", "```", "'")):
                print("   ⚠️  No JSON structure found - skipping")
                return None
            # Remove markdown code block markers if present
            if cleaned_json.startswith("```"):
                # Remove opening marker (```json or ```Json or ```)
                lines = cleaned_json.split("\n")
                if lines[0].startswith("```"):
                    lines = lines[1:]  # Remove first line
                # Remove closing marker (```, ```' or ```")
                if lines and lines[-1].strip() in ("```", "```'", '```"'):
                    lines = lines[:-1]  # Remove last line
                cleaned_json = "\n".join(lines).strip()
            # Remove trailing quotes that might be left over
            if cleaned_json.endswith(("'", '"')):
                cleaned_json = cleaned_json[:-1].strip()
            # Try to find JSON boundaries if there's extra data
            # Look for the first { and its matching }
            start_idx = cleaned_json.find("{")
            if start_idx == -1:
                print("   ⚠️  No opening brace found - not valid JSON")
                return None
            # Find the matching closing brace
            # We need to count braces to find the actual end
            brace_count = 0
            end_idx = -1
            for i in range(start_idx, len(cleaned_json)):
                if cleaned_json[i] == "{":
                    brace_count += 1
                elif cleaned_json[i] == "}":
                    brace_count -= 1
                    if brace_count == 0:
                        end_idx = i + 1
                        break
            if end_idx == -1:
                print("   ⚠️  No matching closing brace found")
                return None
            # Extract just the JSON part
            cleaned_json = cleaned_json[start_idx:end_idx]
            # Parse JSON string
-            profile = json.loads(json_str)
+            profile = json.loads(cleaned_json)
            return profile
        except json.JSONDecodeError as e:
-            print(f"Error parsing JSON: {e}")
+            print(f"   ❌ JSON parsing error: {e}")
            # Print first 200 chars for debugging
            preview = json_str[:200] if len(json_str) > 200 else json_str
            print(f"   Preview: {preview}...")
            return None
        except Exception as e:
            print(f"   ❌ Unexpected error: {e}")
            return None
    async def process_investor_profile(
@@ -338,34 +396,45 @@ Return the lower and upper bounds in USD."""
            if existing_company:
                # Update only founded_year and key executives on the existing company
                company = existing_company
                updated_fields = []
                if company_data.get("founded_year"):
                    company.founded_year = company_data["founded_year"]
                    updated_fields.append(
                        f"founded_year: {company_data['founded_year']}"
                    )
                # Add/update company members (key executives)
                # First, remove existing members if updating
                db.query(CompanyMember).filter_by(company_id=company.id).delete()
                exec_count = 0
                for exec_data in company_data.get("key_executives", []):
                    member = CompanyMember(
                        name=exec_data.get("name"),
                        role=exec_data.get("title"),
                        linkedin=exec_data.get(
                            "source_url"
                        ),  # Store source URL in linkedin field
                        company_id=company.id,
                    )
                    db.add(member)
                    exec_count += 1
                if exec_count > 0:
                    updated_fields.append(f"{exec_count} executives")
                if updated_fields:
                    print(f"      📝 Updated: {', '.join(updated_fields)}")
                return company
            else:
-                # Company should already be in base database, but if not found, skip
+                # Company not found in base database, skip
-                print(
+                print("      ⚠️  Not in database - skipping")
                    f"⚠️  Company '{company_data['name']}' not found in base database - skipping"
                )
                return None
            # Add/update company members (key executives)
            # First, remove existing members if updating
            db.query(CompanyMember).filter_by(company_id=company.id).delete()
            for exec_data in company_data.get("key_executives", []):
                member = CompanyMember(
                    name=exec_data.get("name"),
                    role=exec_data.get("title"),
                    linkedin=exec_data.get(
                        "source_url"
                    ),  # Store source URL in linkedin field
                    company_id=company.id,
                )
                db.add(member)
            return company
        except Exception as e:
-            print(f"Error saving company to database: {e}")
+            print(f"      ❌ Error saving: {e}")
            db.rollback()
            return None
@@ -789,8 +858,11 @@ Return the lower and upper bounds in USD."""
                if pd.notna(row.get("Investor"))
                else None
            )
            # Try both column names for flexibility
            profile_json = (
-                row.get("Final Investor Profile", "")
+                row.get("Perplexity Gap Output", "")
                if pd.notna(row.get("Perplexity Gap Output"))
                else row.get("Final Investor Profile", "")
                if pd.notna(row.get("Final Investor Profile"))
                else None
            )
@@ -80,34 +80,70 @@ class ReportGenerator:
            "thesis": 5,
        }
        # Aggregate data from all funds
        all_sectors = set(investor_data.get("sectors", []))
        all_stages = set()
        all_geographies = []
        check_ranges = []
        for fund in investor_data.get("funds", []):
            all_sectors.update(fund.get("sectors", []))
            all_stages.update(fund.get("investment_stages", []))
            if fund.get("geographic_focus"):
                all_geographies.append(fund["geographic_focus"])
            if fund.get("check_size_lower") and fund.get("check_size_upper"):
                check_ranges.append(
                    {
                        "lower": fund["check_size_lower"],
                        "upper": fund["check_size_upper"],
                    }
                )
        # Sector match
        investor_sectors = set(investor_data.get("sectors", []))
        project_sectors = set(project_data.get("sectors", []))
-        if investor_sectors and project_sectors:
+        if all_sectors and project_sectors:
-            if investor_sectors & project_sectors:
+            if all_sectors & project_sectors:
                score += weights["sector"]
-        # Stage match
+        # Stage match - case insensitive comparison
        investor_stages = set(investor_data.get("investment_stages", []))
        project_stage = project_data.get("stage")
-        if project_stage and project_stage in investor_stages:
+        if project_stage and all_stages:
-            score += weights["stage"]
+            # Normalize stage names for comparison (case-insensitive)
            normalized_stages = {
                stage.lower().replace("_", " ") for stage in all_stages
            }
            project_stage_normalized = project_stage.lower().replace("_", " ")
            if project_stage_normalized in normalized_stages:
                score += weights["stage"]
-        # Geography match
+        # Geography match - check if any fund matches
        investor_geo = (investor_data.get("geographic_focus") or "").lower()
        project_geo = (project_data.get("location") or "").lower()
-        if investor_geo and project_geo and investor_geo in project_geo:
+        geo_match = False
        if all_geographies:
            for geo in all_geographies:
                if geo:
                    geo_lower = geo.lower()
                    # Match if investor geography is "global" or if there's a location overlap
                    if "global" in geo_lower or "worldwide" in geo_lower:
                        geo_match = True
                        break
                    if project_geo and (
                        geo_lower in project_geo or project_geo in geo_lower
                    ):
                        geo_match = True
                        break
        if geo_match:
            score += weights["geography"]
-        # Check size match
+        # Check size match - check if any fund's range matches
        project_valuation = project_data.get("valuation", 0)
-        check_lower = investor_data.get("check_size_lower") or 0
+        check_match = False
-        check_upper = investor_data.get("check_size_upper") or float("inf")
+        if project_valuation and check_ranges:
-        if (
+            for check_range in check_ranges:
-            check_lower
+                if check_range["lower"] <= project_valuation <= check_range["upper"]:
-            and check_upper
+                    check_match = True
-            and check_lower <= project_valuation <= check_upper
+                    break
-        ):
+        if check_match:
            score += weights["check_size"]
        # Thesis alignment (simplified)
@@ -121,86 +157,126 @@ class ReportGenerator:
        """Generate detailed match criteria table"""
        criteria = []
        # Aggregate data from all funds
        all_sectors = set(investor_data.get("sectors", []))
        all_stages = set()
        all_geographies = []
        check_ranges = []
        for fund in investor_data.get("funds", []):
            all_sectors.update(fund.get("sectors", []))
            all_stages.update(fund.get("investment_stages", []))
            if fund.get("geographic_focus"):
                all_geographies.append(fund["geographic_focus"])
            if fund.get("check_size_lower") and fund.get("check_size_upper"):
                check_ranges.append(
                    {
                        "lower": fund["check_size_lower"],
                        "upper": fund["check_size_upper"],
                        "fund_name": fund.get("fund_name", "Unnamed Fund"),
                    }
                )
        # Sector criterion
        investor_sectors = investor_data.get("sectors", [])
        project_sectors = project_data.get("sectors", [])
-        sector_match = (
+        sector_match = "Perfect" if all_sectors & set(project_sectors) else "Mismatch"
            "Perfect" if set(investor_sectors) & set(project_sectors) else "Mismatch"
        )
        criteria.append(
            {
                "name": "Sector",
-                "requirement": "Cybersecurity, B2B SaaS" if project_sectors else "N/A",
+                "requirement": ", ".join(project_sectors) if project_sectors else "N/A",
-                "evidence": ", ".join(investor_sectors[:3])
+                "evidence": ", ".join(list(all_sectors)[:3]) if all_sectors else "N/A",
                if investor_sectors
                else "N/A",
                "match": sector_match,
                "weight": "30%",
            }
        )
-        # Stage criterion
+        # Stage criterion - case insensitive comparison
        investor_stages = investor_data.get("investment_stages", [])
        project_stage = project_data.get("stage", "N/A")
-        stage_match = "Perfect" if project_stage in investor_stages else "Mismatch"
+        stage_match = "Mismatch"
        if project_stage != "N/A" and all_stages:
            # Normalize stage names for comparison
            normalized_stages = {
                stage.lower().replace("_", " ") for stage in all_stages
            }
            project_stage_normalized = project_stage.lower().replace("_", " ")
            stage_match = (
                "Perfect"
                if project_stage_normalized in normalized_stages
                else "Mismatch"
            )
        elif project_stage == "N/A":
            stage_match = "N/A"
        criteria.append(
            {
                "name": "Stage",
                "requirement": str(project_stage),
-                "evidence": ", ".join(investor_stages) if investor_stages else "N/A",
+                "evidence": ", ".join(all_stages) if all_stages else "N/A",
                "match": stage_match,
                "weight": "30%",
            }
        )
        # Geography criterion
        investor_geo = investor_data.get("geographic_focus") or "N/A"
        project_geo = project_data.get("location") or "N/A"
        investor_geo_display = ", ".join(all_geographies) if all_geographies else "N/A"
        # Safe comparison handling None values and "Global" matches
        geo_match = "Mismatch"
        if project_geo != "N/A" and all_geographies:
            for geo in all_geographies:
                if geo:
                    geo_lower = geo.lower()
                    # Match if investor geography is "global" or if there's a location overlap
                    if "global" in geo_lower or "worldwide" in geo_lower:
                        geo_match = "Perfect"
                        break
                    if (
                        geo_lower in project_geo.lower()
                        or project_geo.lower() in geo_lower
                    ):
                        geo_match = "Strong"
                        break
        elif not all_geographies and project_geo == "N/A":
            geo_match = "N/A"
        # Safe comparison handling None values
        if investor_geo == "N/A" or project_geo == "N/A":
            geo_match = (
                "N/A" if investor_geo == "N/A" and project_geo == "N/A" else "Mismatch"
            )
        else:
            investor_geo_lower = investor_geo.lower()
            project_geo_lower = project_geo.lower()
            geo_match = (
                "Strong"
                if investor_geo_lower in project_geo_lower
                or project_geo_lower in investor_geo_lower
                else "Mismatch"
            )
        criteria.append(
            {
                "name": "Geography",
                "requirement": project_geo,
-                "evidence": investor_geo,
+                "evidence": investor_geo_display,
                "match": geo_match,
                "weight": "20%",
            }
        )
        # Check Size criterion
        check_lower = investor_data.get("check_size_lower") or 0
        check_upper = investor_data.get("check_size_upper") or 0
        project_val = project_data.get("valuation", 0)
        # Build evidence string from all fund ranges
        check_evidence = "N/A"
-        if check_lower and check_upper:
+        if check_ranges:
-            check_evidence = (
+            evidence_parts = []
-                f"€{check_lower / 1000000:.0f}M - €{check_upper / 1000000:.0f}M"
+            for cr in check_ranges[:3]:  # Show up to 3 funds
-            )
+                range_str = (
-        elif check_lower:
+                    f"€{cr['lower'] / 1000000:.0f}M - €{cr['upper'] / 1000000:.0f}M"
-            check_evidence = f"€{check_lower / 1000000:.0f}M+"
+                )
                if cr["fund_name"]:
                    evidence_parts.append(f"{cr['fund_name']}: {range_str}")
                else:
                    evidence_parts.append(range_str)
            check_evidence = "; ".join(evidence_parts)
        # Check if project valuation matches any fund
        check_match = "N/A"
        if project_val > 0 and check_ranges:
            match_found = any(
                cr["lower"] <= project_val <= cr["upper"] for cr in check_ranges
            )
            check_match = "Perfect" if match_found else "Mismatch"
        elif project_val > 0:
            check_match = "Strong"
        check_match = (
            "Perfect"
            if check_lower and check_upper and check_lower <= project_val <= check_upper
            else "Strong"
            if project_val > 0
            else "N/A"
        )
        criteria.append(
            {
                "name": "Check Size",
@@ -161,13 +161,6 @@
                                </p>
                            </div>
                            <div>
                                <p class="text-xs text-gray-600">DACH Region:</p>
                                <p class="font-semibold text-gray-900">
                                    {{ investor.geographic_focus or 'N/A' }}
                                </p>
                            </div>
                            <div>
                                <p class="text-xs text-gray-600">AUM (EUR million):</p>
                                <p class="font-semibold text-gray-900">
@@ -179,33 +172,47 @@
                                </p>
                            </div>
-                    <div class="mb-4">
+                            <div>
-                        <p class="text-xs text-gray-600 mb-1">
+                                <p class="text-xs text-gray-600 mb-1">Number of Funds:</p>
-                            Investment Stage:
+                                <p class="font-semibold text-gray-900">
-                        </p>
+                                    {{ investor.funds | length if investor.funds else 'N/A' }}
                        <p class="text-sm font-semibold text-gray-900">
                            {% if investor.investment_stages %} {{
                            investor.investment_stages | join(', ') }} {% else
                            %} N/A {% endif %}
                        </p>
                    </div>
                    <div class="mb-4">
                        <p class="text-xs text-gray-600 mb-1">
                            Est. Investment Size:
                        </p>
                        <p class="text-sm font-semibold text-gray-900">
                            {% if investor.check_size_lower and
                            investor.check_size_upper %} €{{
                            '{:,.0f}'.format(investor.check_size_lower /
                            1000000) }}M - €{{
                            '{:,.0f}'.format(investor.check_size_upper /
                            1000000) }}M {% elif investor.check_size_lower %}
                            €{{ '{:,.0f}'.format(investor.check_size_lower /
                            1000000) }}M+ {% else %} N/A {% endif %}
                                </p>
                            </div>
                        </div>
                        <div class="mt-4">
                            <h3 class="text-xs font-bold text-gray-900 uppercase mb-2">
                                Fund Details
                            </h3>
                            {% if investor.funds %}
                            {% for fund in investor.funds %}
                            <div class="mb-3 pb-3 border-b border-gray-200">
                                <p class="text-sm font-semibold text-gray-900 mb-1">
                                    {{ fund.fund_name or 'Fund ' + loop.index|string }}
                                </p>
                                <div class="text-xs text-gray-700 space-y-1">
                                    {% if fund.fund_size %}
                                    <p>Fund Size: €{{ '{:,.0f}'.format(fund.fund_size / 1000000) }}M</p>
                                    {% endif %}
                                    {% if fund.check_size_lower and fund.check_size_upper %}
                                    <p>Check Size: €{{ '{:,.0f}'.format(fund.check_size_lower / 1000000) }}M - €{{ '{:,.0f}'.format(fund.check_size_upper / 1000000) }}M</p>
                                    {% endif %}
                                    {% if fund.geographic_focus %}
                                    <p>Geography: {{ fund.geographic_focus }}</p>
                                    {% endif %}
                                    {% if fund.investment_stages %}
                                    <p>Stages: {{ fund.investment_stages | join(', ') }}</p>
                                    {% endif %}
                                    {% if fund.sectors %}
                                    <p>Sectors: {{ fund.sectors[:3] | join(', ') }}</p>
                                    {% endif %}
                                </div>
                            </div>
                            {% endfor %}
                            {% else %}
                            <p class="text-xs text-gray-500">No fund information available</p>
                            {% endif %}
                        </div>
                    </div>
                </div>