feat: Update investor report generation and HTML template to include fund details and improve data handling

2025-10-21 10:48:58 +01:00
parent 63d8e57e57
commit 483c2cc114
10 changed files with 289 additions and 135 deletions
@@ -145,16 +145,74 @@ Return the lower and upper bounds in USD."""
        """
        Manually parse the JSON profile from the CSV.
        Returns a cleaned dictionary with the investor profile data.
+        Handles JSON wrapped in markdown code blocks (```json ... ```).
+        Handles trailing quotes and extra data after JSON.
        """
        if not json_str or pd.isna(json_str):
            return None

        try:
+            # Clean the JSON string
+            cleaned_json = json_str.strip()
+
+            # Check if it's plain text (no JSON structure)
+            if not cleaned_json.startswith(("{", "```", "'")):
+                print("   ⚠️  No JSON structure found - skipping")
+                return None
+
+            # Remove markdown code block markers if present
+            if cleaned_json.startswith("```"):
+                # Remove opening marker (```json or ```Json or ```)
+                lines = cleaned_json.split("\n")
+                if lines[0].startswith("```"):
+                    lines = lines[1:]  # Remove first line
+                # Remove closing marker (```, ```' or ```")
+                if lines and lines[-1].strip() in ("```", "```'", '```"'):
+                    lines = lines[:-1]  # Remove last line
+                cleaned_json = "\n".join(lines).strip()
+
+            # Remove trailing quotes that might be left over
+            if cleaned_json.endswith(("'", '"')):
+                cleaned_json = cleaned_json[:-1].strip()
+
+            # Try to find JSON boundaries if there's extra data
+            # Look for the first { and its matching }
+            start_idx = cleaned_json.find("{")
+            if start_idx == -1:
+                print("   ⚠️  No opening brace found - not valid JSON")
+                return None
+
+            # Find the matching closing brace
+            # We need to count braces to find the actual end
+            brace_count = 0
+            end_idx = -1
+            for i in range(start_idx, len(cleaned_json)):
+                if cleaned_json[i] == "{":
+                    brace_count += 1
+                elif cleaned_json[i] == "}":
+                    brace_count -= 1
+                    if brace_count == 0:
+                        end_idx = i + 1
+                        break
+
+            if end_idx == -1:
+                print("   ⚠️  No matching closing brace found")
+                return None
+
+            # Extract just the JSON part
+            cleaned_json = cleaned_json[start_idx:end_idx]
+
            # Parse JSON string
-            profile = json.loads(json_str)
+            profile = json.loads(cleaned_json)
            return profile
        except json.JSONDecodeError as e:
-            print(f"Error parsing JSON: {e}")
+            print(f"   ❌ JSON parsing error: {e}")
+            # Print first 200 chars for debugging
+            preview = json_str[:200] if len(json_str) > 200 else json_str
+            print(f"   Preview: {preview}...")
+            return None
+        except Exception as e:
+            print(f"   ❌ Unexpected error: {e}")
            return None

    async def process_investor_profile(
@@ -338,34 +396,45 @@ Return the lower and upper bounds in USD."""
            if existing_company:
                # Update only founded_year on existing company
                company = existing_company
+                updated_fields = []
+
                if company_data.get("founded_year"):
                    company.founded_year = company_data["founded_year"]
+                    updated_fields.append(
+                        f"founded_year: {company_data['founded_year']}"
+                    )
+
+                # Add/update company members (key executives)
+                # First, remove existing members if updating
+                db.query(CompanyMember).filter_by(company_id=company.id).delete()
+
+                exec_count = 0
+                for exec_data in company_data.get("key_executives", []):
+                    member = CompanyMember(
+                        name=exec_data.get("name"),
+                        role=exec_data.get("title"),
+                        linkedin=exec_data.get(
+                            "source_url"
+                        ),  # Store source URL in linkedin field
+                        company_id=company.id,
+                    )
+                    db.add(member)
+                    exec_count += 1
+
+                if exec_count > 0:
+                    updated_fields.append(f"{exec_count} executives")
+
+                if updated_fields:
+                    print(f"      📝 Updated: {', '.join(updated_fields)}")
+
+                return company
            else:
-                # Company should already be in base database, but if not found, skip
-                print(
-                    f"⚠️  Company '{company_data['name']}' not found in base database - skipping"
-                )
+                # Company not found in base database, skip
+                print("      ⚠️  Not in database - skipping")
                return None

-            # Add/update company members (key executives)
-            # First, remove existing members if updating
-            db.query(CompanyMember).filter_by(company_id=company.id).delete()
-
-            for exec_data in company_data.get("key_executives", []):
-                member = CompanyMember(
-                    name=exec_data.get("name"),
-                    role=exec_data.get("title"),
-                    linkedin=exec_data.get(
-                        "source_url"
-                    ),  # Store source URL in linkedin field
-                    company_id=company.id,
-                )
-                db.add(member)
-
-            return company
-
        except Exception as e:
-            print(f"Error saving company to database: {e}")
+            print(f"      ❌ Error saving: {e}")
            db.rollback()
            return None

@@ -789,8 +858,11 @@ Return the lower and upper bounds in USD."""
                if pd.notna(row.get("Investor"))
                else None
            )
+            # Try both column names for flexibility
            profile_json = (
-                row.get("Final Investor Profile", "")
+                row.get("Perplexity Gap Output", "")
+                if pd.notna(row.get("Perplexity Gap Output"))
+                else row.get("Final Investor Profile", "")
                if pd.notna(row.get("Final Investor Profile"))
                else None
            )
@@ -80,34 +80,70 @@ class ReportGenerator:
            "thesis": 5,
        }

+        # Aggregate data from all funds
+        all_sectors = set(investor_data.get("sectors", []))
+        all_stages = set()
+        all_geographies = []
+        check_ranges = []
+
+        for fund in investor_data.get("funds", []):
+            all_sectors.update(fund.get("sectors", []))
+            all_stages.update(fund.get("investment_stages", []))
+            if fund.get("geographic_focus"):
+                all_geographies.append(fund["geographic_focus"])
+            if fund.get("check_size_lower") and fund.get("check_size_upper"):
+                check_ranges.append(
+                    {
+                        "lower": fund["check_size_lower"],
+                        "upper": fund["check_size_upper"],
+                    }
+                )
+
        # Sector match
-        investor_sectors = set(investor_data.get("sectors", []))
        project_sectors = set(project_data.get("sectors", []))
-        if investor_sectors and project_sectors:
-            if investor_sectors & project_sectors:
+        if all_sectors and project_sectors:
+            if all_sectors & project_sectors:
                score += weights["sector"]

-        # Stage match
-        investor_stages = set(investor_data.get("investment_stages", []))
+        # Stage match - case insensitive comparison
        project_stage = project_data.get("stage")
-        if project_stage and project_stage in investor_stages:
-            score += weights["stage"]
+        if project_stage and all_stages:
+            # Normalize stage names for comparison (case-insensitive)
+            normalized_stages = {
+                stage.lower().replace("_", " ") for stage in all_stages
+            }
+            project_stage_normalized = project_stage.lower().replace("_", " ")
+            if project_stage_normalized in normalized_stages:
+                score += weights["stage"]

-        # Geography match
-        investor_geo = (investor_data.get("geographic_focus") or "").lower()
+        # Geography match - check if any fund matches
        project_geo = (project_data.get("location") or "").lower()
-        if investor_geo and project_geo and investor_geo in project_geo:
+        geo_match = False
+        if all_geographies:
+            for geo in all_geographies:
+                if geo:
+                    geo_lower = geo.lower()
+                    # Match if investor geography is "global" or if there's a location overlap
+                    if "global" in geo_lower or "worldwide" in geo_lower:
+                        geo_match = True
+                        break
+                    if project_geo and (
+                        geo_lower in project_geo or project_geo in geo_lower
+                    ):
+                        geo_match = True
+                        break
+        if geo_match:
            score += weights["geography"]

-        # Check size match
+        # Check size match - check if any fund's range matches
        project_valuation = project_data.get("valuation", 0)
-        check_lower = investor_data.get("check_size_lower") or 0
-        check_upper = investor_data.get("check_size_upper") or float("inf")
-        if (
-            check_lower
-            and check_upper
-            and check_lower <= project_valuation <= check_upper
-        ):
+        check_match = False
+        if project_valuation and check_ranges:
+            for check_range in check_ranges:
+                if check_range["lower"] <= project_valuation <= check_range["upper"]:
+                    check_match = True
+                    break
+        if check_match:
            score += weights["check_size"]

        # Thesis alignment (simplified)
@@ -121,86 +157,126 @@ class ReportGenerator:
        """Generate detailed match criteria table"""
        criteria = []

+        # Aggregate data from all funds
+        all_sectors = set(investor_data.get("sectors", []))
+        all_stages = set()
+        all_geographies = []
+        check_ranges = []
+
+        for fund in investor_data.get("funds", []):
+            all_sectors.update(fund.get("sectors", []))
+            all_stages.update(fund.get("investment_stages", []))
+            if fund.get("geographic_focus"):
+                all_geographies.append(fund["geographic_focus"])
+            if fund.get("check_size_lower") and fund.get("check_size_upper"):
+                check_ranges.append(
+                    {
+                        "lower": fund["check_size_lower"],
+                        "upper": fund["check_size_upper"],
+                        "fund_name": fund.get("fund_name", "Unnamed Fund"),
+                    }
+                )
+
        # Sector criterion
-        investor_sectors = investor_data.get("sectors", [])
        project_sectors = project_data.get("sectors", [])
-        sector_match = (
-            "Perfect" if set(investor_sectors) & set(project_sectors) else "Mismatch"
-        )
+        sector_match = "Perfect" if all_sectors & set(project_sectors) else "Mismatch"
        criteria.append(
            {
                "name": "Sector",
-                "requirement": "Cybersecurity, B2B SaaS" if project_sectors else "N/A",
-                "evidence": ", ".join(investor_sectors[:3])
-                if investor_sectors
-                else "N/A",
+                "requirement": ", ".join(project_sectors) if project_sectors else "N/A",
+                "evidence": ", ".join(list(all_sectors)[:3]) if all_sectors else "N/A",
                "match": sector_match,
                "weight": "30%",
            }
        )

-        # Stage criterion
-        investor_stages = investor_data.get("investment_stages", [])
+        # Stage criterion - case insensitive comparison
        project_stage = project_data.get("stage", "N/A")
-        stage_match = "Perfect" if project_stage in investor_stages else "Mismatch"
+        stage_match = "Mismatch"
+        if project_stage != "N/A" and all_stages:
+            # Normalize stage names for comparison
+            normalized_stages = {
+                stage.lower().replace("_", " ") for stage in all_stages
+            }
+            project_stage_normalized = project_stage.lower().replace("_", " ")
+            stage_match = (
+                "Perfect"
+                if project_stage_normalized in normalized_stages
+                else "Mismatch"
+            )
+        elif project_stage == "N/A":
+            stage_match = "N/A"
+
        criteria.append(
            {
                "name": "Stage",
                "requirement": str(project_stage),
-                "evidence": ", ".join(investor_stages) if investor_stages else "N/A",
+                "evidence": ", ".join(all_stages) if all_stages else "N/A",
                "match": stage_match,
                "weight": "30%",
            }
        )

        # Geography criterion
-        investor_geo = investor_data.get("geographic_focus") or "N/A"
        project_geo = project_data.get("location") or "N/A"
+        investor_geo_display = ", ".join(all_geographies) if all_geographies else "N/A"
+
+        # Safe comparison handling None values and "Global" matches
+        geo_match = "Mismatch"
+        if project_geo != "N/A" and all_geographies:
+            for geo in all_geographies:
+                if geo:
+                    geo_lower = geo.lower()
+                    # Match if investor geography is "global" or if there's a location overlap
+                    if "global" in geo_lower or "worldwide" in geo_lower:
+                        geo_match = "Perfect"
+                        break
+                    if (
+                        geo_lower in project_geo.lower()
+                        or project_geo.lower() in geo_lower
+                    ):
+                        geo_match = "Strong"
+                        break
+        elif not all_geographies and project_geo == "N/A":
+            geo_match = "N/A"

-        # Safe comparison handling None values
-        if investor_geo == "N/A" or project_geo == "N/A":
-            geo_match = (
-                "N/A" if investor_geo == "N/A" and project_geo == "N/A" else "Mismatch"
-            )
-        else:
-            investor_geo_lower = investor_geo.lower()
-            project_geo_lower = project_geo.lower()
-            geo_match = (
-                "Strong"
-                if investor_geo_lower in project_geo_lower
-                or project_geo_lower in investor_geo_lower
-                else "Mismatch"
-            )
        criteria.append(
            {
                "name": "Geography",
                "requirement": project_geo,
-                "evidence": investor_geo,
+                "evidence": investor_geo_display,
                "match": geo_match,
                "weight": "20%",
            }
        )

        # Check Size criterion
-        check_lower = investor_data.get("check_size_lower") or 0
-        check_upper = investor_data.get("check_size_upper") or 0
        project_val = project_data.get("valuation", 0)

+        # Build evidence string from all fund ranges
        check_evidence = "N/A"
-        if check_lower and check_upper:
-            check_evidence = (
-                f"€{check_lower / 1000000:.0f}M - €{check_upper / 1000000:.0f}M"
-            )
-        elif check_lower:
-            check_evidence = f"€{check_lower / 1000000:.0f}M+"
+        if check_ranges:
+            evidence_parts = []
+            for cr in check_ranges[:3]:  # Show up to 3 funds
+                range_str = (
+                    f"€{cr['lower'] / 1000000:.0f}M - €{cr['upper'] / 1000000:.0f}M"
+                )
+                if cr["fund_name"]:
+                    evidence_parts.append(f"{cr['fund_name']}: {range_str}")
+                else:
+                    evidence_parts.append(range_str)
+            check_evidence = "; ".join(evidence_parts)
+
+        # Check if project valuation matches any fund
+        check_match = "N/A"
+        if project_val > 0 and check_ranges:
+            match_found = any(
+                cr["lower"] <= project_val <= cr["upper"] for cr in check_ranges
+            )
+            check_match = "Perfect" if match_found else "Mismatch"
+        elif project_val > 0:
+            check_match = "Strong"

-        check_match = (
-            "Perfect"
-            if check_lower and check_upper and check_lower <= project_val <= check_upper
-            else "Strong"
-            if project_val > 0
-            else "N/A"
-        )
        criteria.append(
            {
                "name": "Check Size",