From e386ebbdefdaccc616aa85a4d6a836bdd6288ea7 Mon Sep 17 00:00:00 2001
From: bolade <babawale030@gmail.com>
Date: Mon, 13 Oct 2025 23:19:46 +0100
Subject: [PATCH] feat: Add insight generation functionality with compatibility
 scoring and web search integration

---
 .gitignore                                    |   2 +-
 app/__pycache__/main.cpython-312.pyc          | Bin 5023 -> 5130 bytes
 app/main.py                                   |   3 +-
 .../__pycache__/companies.cpython-312.pyc     | Bin 11046 -> 11055 bytes
 .../__pycache__/investors.cpython-312.pyc     | Bin 23593 -> 23602 bytes
 app/routers/insight_route.py                  |  80 ++++++++
 app/schemas/insight_schema.py                 |  18 ++
 .../__pycache__/llm_parser.cpython-312.pyc    | Bin 37401 -> 37410 bytes
 .../__pycache__/querying.cpython-312.pyc      | Bin 8315 -> 8324 bytes
 app/services/compatibility_score.py           | 137 ++++++++++++++
 app/services/insight.py                       | 175 ++++++++++++++++++
 11 files changed, 413 insertions(+), 2 deletions(-)
 create mode 100644 app/routers/insight_route.py
 create mode 100644 app/schemas/insight_schema.py
diff --git a/.gitignore b/.gitignore
index f0e8930..e729125 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,4 @@
 
 *.cypython
 
-
+nohup.out
\ No newline at end of file
diff --git a/app/__pycache__/main.cpython-312.pyc b/app/__pycache__/main.cpython-312.pyc
index fcb4a864bafaaa50fac135d7484f66ffe4f53c42..413ba49e71a5379ee1d0f71a46fab7343d36a4d0 100644
GIT binary patch
delta 988
zcmaJ<O=}ZT6rGnzCTZ+wlF2m5<g2xfNiD|Kf{Ik725GS%R1|a&(~x=jsPhr;B$eW#
zF0v9AhW7(>Uv~kwZrlk<MXAifZrlpG5pm^v6Ixr5KA6M3=bU@r<-U2Ie^L<NieiwX
zr?&8|@xCV}JORE}Fz*?%DkDzgG`~J##MGD(SL1FT&=ZEDDsDWaCykVvLY#-gflbzh
z9;yVpoSLRw`Y%Vb;BKa7df#PV7H3xHNf1qrjnU}ppm;xO$Fxu#;c+}cZlZH?t|n+B
zUEf}l6Re87KyoRLV+mTQ#p~#0Vv`^A&=j1ld9-BLyH}B?70df?7{oZbdV>@_>BRrl
z@S&=Fs2Vv`l?H0;q?Wp0;5bOqbt8RBoubQ{TD?u)d3wl4R``*lV$-yn^+vl4R;P{o
z9sV&wDe}`hffSPTy+T>?!PkwmC4j;pNx>0HmRW1oVT_nUF7EPq%3q%pV4RsI805&N
zpeT%ogXNH6T!!oj8Nsd15Z*tRc1;JJj?lmrz1@H!x$9Su(tqfW`jAAP4i}EhuqJ01
z%#zo`GhuhIp;iTKT&~({P$EeDAxtyF>HaS<>Y0tw*b!S`Rk3ZCA9OIRLzZ%rJg|QR
z<{4a}V6%rF9UX-vxg3(wBDodX2%Ulply!tw#ndV`oF`vH)&_(amu;<U6|>!}mhEZ-
z8x_0gE)#ZB2vZkCm{qbdtTpKXMO`<_?z8|BQ^I?SJ6Y4L0~;7p46ZR?YjgNIZbOkp
z-j+q)grh=<X^Z4bICYjS-|=_o1s0eN4;}NL)&CC68J5=3@g-QHUPM3p5t)u0_b+|o
mmUg)1{+&n~ML%==HW%6ABF`>vOZhD+|B)+@H7Vz1&-DiwqR|-u

delta 1060
zcmaJ<O-vI(6yBe9TPO>(#kR|`O~aps2#OapX+*#TF+@43*@K&=J8ie@Pkpl`@l*~?
zOgtEI?syjuBu(^Yyqg#j<7F@E#k*He&TIii)JgW+dEfiqH*eqUlj4i>@_SkC=jeL=
z@>~7suB-?jgx#Xs<~fryWA-^Gqi39~o(<!;opTgj3HN>WsH5sC&j}og?=uIwXr{lz
z>0^`|`^C|y|2Wb$fm`FuM29m6(9pivtJCvHYb^ElArghpM+=l(B_&=F$I%42&o8zo
ztCE@NL{H47nBupxqpg`mS(H&|Waeu8E9LkYa}?#Pf~j^QCl1rp;m%nHlx}REWyyh%
zQ2wq@Ha5wgp#38|^51N#m(9+YV_QXz1C@@Rw`cWPI=I=!T@n{}Gm)UL;yKN-YatxO
ztDe1ORKV%J6?b@EB}>ssULj`mHJ>M{)XC*&@4&tvMkxf+s^?a%8WhMIX=)?nrzwAX
zMuKrBnqV-=lp^*a7&45q>w13Kw>%dc1~4(4_5_V_8Y#x|QfR1=jacbQC_1eSq&h0w
zejVm$1#`kOg&pq8K>FFe!-RGi(Lm|Gi;YD26!|nzOk85l3k)ukxV(@IKcTN#28@gf
z-i8%&Oa3lhWP<tb^5BTDo~HXiZh}`q7#j~;2p&P6ax)^_V6y8BmMLKNzZD9F-dKi0
zrD3_-2>TvjSR;GG?cob>m8t@X4%;kaSR`{vuN`D?rH-63o(HexBQTCa7-rr{R*=RB
zwk_Jiylp#1*e^heWs)bE@I=e40c#ml2I~yiGy}1Q{BTM^r0HQlh}afG)2jFn0?~4P
zSYg^VrX950irq4i(Q7A=dhhT)gIJ69;ei{7&~pE{xF7M7<C!fRT?2y}@$|tp@;Egf
WD}CfjpShdegOtXTuj#1>yVsvQP3gM;

diff --git a/app/main.py b/app/main.py
index 720bf20..0f04f93 100644
--- a/app/main.py
+++ b/app/main.py
@@ -5,7 +5,7 @@ from db.db import Base, db_dependency, engine
 from dotenv import load_dotenv
 from fastapi import FastAPI, File, Form, UploadFile
 from pydantic import BaseModel
-from routers import companies, folk_crm, investors, projects
+from routers import companies, folk_crm, insight_route, investors, projects
 from schemas.router_schemas import InvestmentResponse, PaginatedResponse
 from services.llm_parser import InvestorProcessor
 from services.querying import QueryProcessor
@@ -109,6 +109,7 @@ app.include_router(investors.router)
 app.include_router(companies.router)
 app.include_router(projects.router)
 app.include_router(folk_crm.router)
+app.include_router(insight_route.router)
 
 if __name__ == "__main__":
     import uvicorn
diff --git a/app/routers/__pycache__/companies.cpython-312.pyc b/app/routers/__pycache__/companies.cpython-312.pyc
index ab5935069f732fe0476d9cb5dbe4f1ccc5d2cf80..6e3cf9594540e4af92712cfe4cc55d31e1b45c7f 100644
GIT binary patch
delta 44
zcmZ1$wmyvaG%qg~0}!OSzs-o+$h%3MOTM5oB{8ogGdX_p0rm5YUYjRqOk)B7MgR|N

delta 35
qcmZ1<wk(YIG%qg~0}$|QKg$T*$h%3MQEc)__4AC5n`dfFV*&uaa|<N^

diff --git a/app/routers/__pycache__/investors.cpython-312.pyc b/app/routers/__pycache__/investors.cpython-312.pyc
index 3227140fda3abf9538fb0d7f0bfb1fb1e389b755..dd436cf08ce101dd4fd04a434b14496e914b3474 100644
GIT binary patch
delta 41
vcmZ3vgK^UiM!wU$yj%=Gkmmk2gDHF?Uwb&0d_iSOVqQsRa{T6L;geYa6*djt

delta 32
ncmdnAgK^~!M!wU$yj%=G@VER~#?P>geC^?kVw>lOPi6rC!3zt}

diff --git a/app/routers/insight_route.py b/app/routers/insight_route.py
new file mode 100644
index 0000000..632bde2
--- /dev/null
+++ b/app/routers/insight_route.py
@@ -0,0 +1,80 @@
+from typing import Optional
+
+from db.db import get_db
+from db.models import InvestorTable, ProjectTable
+from fastapi import APIRouter, Depends, HTTPException
+from schemas.insight_schema import InsightResponse
+from services.compatibility_score import (
+    calculate_project_investor_compatibility,
+    generate_compatibility_explanation,
+)
+from services.insight import QueryProcessor
+from sqlalchemy.orm import Session
+
+router = APIRouter()
+
+
+@router.get(
+    "/insights/{investor_id}", response_model=InsightResponse, tags=["Insights"]
+)
+async def get_insights(
+    investor_id: int, project_id: Optional[int] = None, db: Session = Depends(get_db)
+):
+    """
+    Get investor insights including investment pattern analysis, market position,
+    and optionally a compatibility explanation for a project.
+
+    Args:
+        investor_id: The ID of the investor to analyze
+        project_id: Optional project ID to calculate compatibility score
+        db: Database session provided through the get_db dependency
+
+    Returns:
+        InsightResponse with investment_pattern_analysis, market_position,
+        and compatibility_score (a placeholder text when project_id is omitted)
+
+    Raises:
+        HTTPException: 404 if the investor or the requested project is not found
+    """
+    investor = db.query(InvestorTable).filter(InvestorTable.id == investor_id).first()
+    if not investor:
+        raise HTTPException(
+            status_code=404, detail=f"Investor with id {investor_id} not found"
+        )
+
+    # Resolve the project before the agent call so a bad project_id fails fast
+    # with a 404 instead of after the web-search agent has already run. Compare
+    # against None explicitly so that an id of 0 is looked up rather than ignored.
+    project = None
+    if project_id is not None:
+        project = db.query(ProjectTable).filter(ProjectTable.id == project_id).first()
+        if not project:
+            raise HTTPException(
+                status_code=404, detail=f"Project with id {project_id} not found"
+            )
+
+    # Get investment pattern analysis and market position using web search
+    insights = await QueryProcessor().get_investor_insights(
+        investor_name=investor.name,
+        investor_website=investor.website,
+        investor_description=investor.description,
+        investor_headquarters=investor.headquarters,
+        investment_thesis=investor.investment_thesis,
+        portfolio_highlights=investor.portfolio_highlights,
+    )
+
+    if project is None:
+        compatibility_score = "Select a project to see compatibility analysis"
+    else:
+        score = calculate_project_investor_compatibility(
+            project, investor, use_funds=True
+        )
+        compatibility_score = generate_compatibility_explanation(
+            project, investor, score, use_funds=True
+        )
+
+    return InsightResponse(
+        investment_pattern_analysis=insights["investment_pattern_analysis"],
+        market_position=insights["market_position"],
+        compatibility_score=compatibility_score,
+    )
diff --git a/app/schemas/insight_schema.py b/app/schemas/insight_schema.py
new file mode 100644
index 0000000..ddf807d
--- /dev/null
+++ b/app/schemas/insight_schema.py
@@ -0,0 +1,18 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class InsightResponse(BaseModel):
+    investment_pattern_analysis: str  # short narrative on recent deal activity
+    market_position: str  # short narrative on the investor's market standing
+    compatibility_score: Optional[str] = None  # explanation text, not a number
+
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "investment_pattern_analysis": "Sequoia has been increasingly active in AI/ML startups (43% increase in last 18 months). Their average investment size has grown 23% year-over-year, indicating confidence in larger rounds. Peak activity in Q2-Q3, suggesting seasonal investment patterns.",
+                "market_position": "Top 3 most active VC in enterprise software deals. Strong presence in unicorn companies (47 portfolio unicorns). Consistently leads or co-leads rounds, indicating decision-making influence.",
+                "compatibility_score": "Based on your startup profile: 85% match Excellent match: AI sector focus, Seed stage.",
+            }
+        }
diff --git a/app/services/__pycache__/llm_parser.cpython-312.pyc b/app/services/__pycache__/llm_parser.cpython-312.pyc
index c929764619f878340e8b9ea0eb6932bb76649b70..8ee9cab36abc72f708b752a879ad3efaeb3e179e 100644
GIT binary patch
delta 45
zcmbQaglW+dCf?J$yj%=Gkmmk2<KaeLmx)}`1(hj@c_o?2@sool&SG?&{C1)a09z#x
AX#fBK

delta 36
rcmZ3qglXmyCf?J$yj%=G5FP$3<Hkl_mx+u*lcOijVzik2ZK4hU*k=ry

diff --git a/app/services/__pycache__/querying.cpython-312.pyc b/app/services/__pycache__/querying.cpython-312.pyc
index 330306d61c33f9ca5d11833616caab77f4734159..2f5f8568b9de0b861c272865a7ac4ceefc6a3af9 100644
GIT binary patch
delta 38
scmezE(BjB@nwOW00SMCE-)2Z`<c*T&k}s%CNz5zBOpf22Aur4b0N^PLfB*mh

delta 29
jcmZp1{O!PdnwOW00SNw<Kg;0X$QvckD7Lv!UYHR8ig*Zf

diff --git a/app/services/compatibility_score.py b/app/services/compatibility_score.py
index 30472f5..4bb7e6c 100644
--- a/app/services/compatibility_score.py
+++ b/app/services/compatibility_score.py
@@ -507,3 +507,140 @@ def get_compatibility_score_breakdown(
             ),
             "note": "Using investor-level data (no specific fund selected)",
         }
+
+
+def generate_compatibility_explanation(
+    project: ProjectTable, investor: InvestorTable, score: float, use_funds: bool = True
+) -> str:
+    """
+    Generate a detailed, natural language explanation of the compatibility score.
+
+    The text has up to three space-separated parts: the percentage headline, the
+    match level with any alignment factors found, and at most one recommendation.
+
+    Args:
+        project: The project being evaluated
+        investor: The investor being compared against
+        score: The calculated compatibility score (0-1)
+        use_funds: Whether fund-level data was used
+
+    Returns:
+        A formatted string with the compatibility score and detailed explanation
+    """
+    # round() rather than int(): a product such as 0.57 * 100 evaluates to
+    # 56.99999999999999, which truncation would under-report as 56%.
+    score_percentage = round(score * 100)
+
+    # Determine match quality
+    if score_percentage >= 80:
+        match_level = "Excellent match"
+    elif score_percentage >= 65:
+        match_level = "Strong match"
+    elif score_percentage >= 50:
+        match_level = "Good match"
+    elif score_percentage >= 35:
+        match_level = "Moderate match"
+    else:
+        match_level = "Limited match"
+
+    # Collect alignment factors
+    alignment_factors = []
+    recommendations = []
+
+    # Get the best matching fund if using funds
+    best_fund = None
+    if use_funds and investor.funds:
+        best_score = 0
+        for fund in investor.funds:
+            fund_score = _calculate_project_fund_compatibility(project, fund)
+            if fund_score > best_score:
+                best_score = fund_score
+                best_fund = fund
+
+    # Analyze sector alignment (best fund's sectors first, else investor-level)
+    if project.sector:
+        project_sectors = [s.name for s in project.sector if hasattr(s, "name")]
+        from_fund = bool(best_fund and best_fund.sectors)
+        candidate_sectors = best_fund.sectors if from_fund else investor.sectors
+
+        if candidate_sectors:
+            sector_names = {s.name for s in candidate_sectors if hasattr(s, "name")}
+            common_sectors = sector_names.intersection(project_sectors)
+
+            if common_sectors:
+                # Sort before truncating: set iteration order is arbitrary, so
+                # list(...)[:2] could show different sectors for the same data.
+                sectors_str = ", ".join(sorted(common_sectors)[:2])
+                alignment_factors.append(f"{sectors_str} sector focus")
+            elif from_fund and project_sectors:
+                recommendations.append(
+                    f"Consider emphasizing any {project_sectors[0]} industry connections"
+                )
+
+    # Analyze stage alignment
+    if project.stage:
+        stage_name = getattr(project.stage, "value", str(project.stage))
+        stage_display = stage_name.replace("_", " ").title()
+
+        if best_fund and best_fund.investment_stages:
+            fund_stage_names = {
+                s.name for s in best_fund.investment_stages if hasattr(s, "name")
+            }
+            if stage_name in fund_stage_names:
+                alignment_factors.append(f"{stage_display} stage")
+            else:
+                recommendations.append(
+                    "Investor typically focuses on different stages; highlight your traction and growth metrics"
+                )
+
+        # NOTE(review): with no fund selected the stage is listed as aligned
+        # without being compared against anything - confirm this is intended.
+        if not best_fund:
+            alignment_factors.append(f"{stage_display} stage")
+
+    # Analyze geographic alignment
+    if project.location:
+        if best_fund and best_fund.geographic_focus:
+            if (
+                project.location.lower() in best_fund.geographic_focus.lower()
+                or best_fund.geographic_focus.lower() in project.location.lower()
+            ):
+                alignment_factors.append(f"{project.location} presence")
+        elif investor.headquarters:
+            if (
+                project.location.lower() in investor.headquarters.lower()
+                or investor.headquarters.lower() in project.location.lower()
+            ):
+                alignment_factors.append(f"{project.location} market presence")
+
+    # Analyze valuation/check size fit
+    if project.valuation:
+        if best_fund and best_fund.check_size_lower and best_fund.check_size_upper:
+            reasonable_min = best_fund.check_size_lower * 3
+            reasonable_max = best_fund.check_size_upper * 10
+
+            if reasonable_min <= project.valuation <= reasonable_max:
+                alignment_factors.append("appropriate funding stage")
+            elif project.valuation < reasonable_min:
+                recommendations.append(
+                    "You may be early for this investor; consider approaching at a later stage"
+                )
+            else:
+                recommendations.append(
+                    "Consider highlighting your growth trajectory and market opportunity"
+                )
+
+    # Build the explanation
+    explanation_parts = [f"Based on your startup profile: {score_percentage}% match"]
+
+    if alignment_factors:
+        alignment_text = ", ".join(alignment_factors)
+        explanation_parts.append(f"{match_level}: {alignment_text}.")
+    else:
+        explanation_parts.append(f"{match_level}.")
+
+    if recommendations:
+        rec_text = recommendations[0]  # Show the most important recommendation
+        explanation_parts.append(rec_text + ".")
+
+    return " ".join(explanation_parts)
diff --git a/app/services/insight.py b/app/services/insight.py
index e69de29..c4e9387 100644
--- a/app/services/insight.py
+++ b/app/services/insight.py
@@ -0,0 +1,175 @@
+import asyncio
+import logging
+import os
+
+from crawl4ai import AsyncWebCrawler
+from ddgs import DDGS
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from langgraph.prebuilt import create_react_agent
+from schemas.insight_schema import InsightResponse
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+)
+logger = logging.getLogger("web_search_agent")
+# Load .env values into the environment before reading the API key from it.
+load_dotenv()
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+# Only warn here: a missing key is reported at import, not treated as fatal.
+if not OPENROUTER_API_KEY:
+    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")
+
+
+class QueryProcessor:
+    """Researches an investor through an LLM agent equipped with a web-search tool."""
+
+    # Returned when the agent fails or yields no usable structured response.
+    _FALLBACK_INSIGHTS = {
+        "investment_pattern_analysis": "Unable to retrieve investment pattern analysis at this time.",
+        "market_position": "Unable to retrieve market position at this time.",
+    }
+
+    def __init__(self):
+        # Build the search client before the agent that exposes web_search, so
+        # the tool can never observe an instance without ddg_search.
+        self.ddg_search = DDGS()
+        self.llm = ChatOpenAI(
+            api_key=OPENROUTER_API_KEY,
+            base_url="https://openrouter.ai/api/v1",
+            model="openai/gpt-5-nano",
+            temperature=0,
+        )
+        self.agent = create_react_agent(
+            model=self.llm,
+            tools=[self.web_search],
+            response_format=InsightResponse,
+        )
+
+    async def crawl(self, url: str):
+        """Tool to search the web using a web crawler. given the url"""
+
+        logger.info(f"\nCrawl tool called with url: {url}")
+        async with AsyncWebCrawler() as crawler:
+            results = await crawler.arun(url)
+            return results.markdown
+
+    def web_search(self, query: str):
+        """Tool to search the web using google, provide the relevant query to get the information"""
+        logger.info(f"\nWeb Search Tool Called with query: {query}")
+        if query:
+            result = self.ddg_search.text(query, max_results=10, backend="google")
+            return result
+        return "No query provided."
+
+    async def get_investor_insights(
+        self,
+        investor_name: str,
+        investor_website: str | None = None,
+        investor_description: str | None = None,
+        investor_headquarters: str | None = None,
+        investment_thesis: list | None = None,
+        portfolio_highlights: list | None = None,
+    ) -> dict:
+        """
+        Get investment pattern analysis and market position for an investor.
+
+        Args:
+            investor_name: Name of the investor/VC firm
+            investor_website: Website URL of the investor
+            investor_description: Description of the investor
+            investor_headquarters: Headquarters location
+            investment_thesis: List of investment thesis statements
+            portfolio_highlights: List of notable portfolio companies
+
+        Returns:
+            Dictionary with investment_pattern_analysis and market_position;
+            fallback texts are used when the agent call fails
+        """
+        logger.info(f"Getting insights for investor: {investor_name}")
+
+        # Build context information
+        context_parts = [f'Investment Firm: "{investor_name}"']
+
+        if investor_website:
+            context_parts.append(f"Website: {investor_website}")
+        if investor_headquarters:
+            context_parts.append(f"Location: {investor_headquarters}")
+        if investor_description:
+            context_parts.append(f"Description: {investor_description}")
+        if investment_thesis:
+            # str() each entry: join() raises TypeError on non-string items
+            thesis_str = ", ".join(str(t) for t in investment_thesis[:3])
+            context_parts.append(f"Investment Focus: {thesis_str}")
+        if portfolio_highlights:
+            portfolio_str = ", ".join(str(p) for p in portfolio_highlights[:5])
+            context_parts.append(f"Notable Portfolio Companies: {portfolio_str}")
+
+        context = "\n".join(context_parts)
+
+        prompt = f"""
+        Research and analyze the following investment firm:
+        
+        {context}
+        
+        CRITICAL INSTRUCTIONS:
+        - You MUST provide concrete, data-driven insights with specific numbers and percentages
+        - Use the web_search tool to find recent news, press releases, and investment databases (Crunchbase, PitchBook, etc.)
+        - If you cannot find sufficient data after searching, make reasonable inferences based on available information
+        - DO NOT state that data is unavailable or ambiguous - provide the best analysis possible with what you find
+        - Focus on ACTIONABLE insights, not disclaimers
+        
+        Provide insights in the InsightResponse schema format:
+        
+        1. investment_pattern_analysis (MAX 3 SENTENCES):
+           - Recent investment activity and trends in the last 12-18 months
+           - Investment size ranges, deal frequency, and sector preferences
+           - Notable patterns (e.g., "increased AI investments by 40%", "average check size $5-10M")
+           - If specific numbers aren't available, provide reasonable estimates based on portfolio and market position
+        
+        2. market_position (MAX 3 SENTENCES):
+           - Standing in the venture capital market
+           - Activity level in specific sectors and notable unicorn investments
+           - Deal leadership roles (lead vs co-lead) and market influence
+           - Regional or global market presence and competitive positioning
+        
+        Use the web_search tool strategically. Search for:
+        - "{investor_name}" recent investments 2024 2025
+        - "{investor_name}" portfolio Crunchbase
+        - "{investor_name}" funding rounds news
+        - Specific portfolio companies if mentioned above
+        """
+
+        try:
+            result = await self.agent.ainvoke({"messages": [("user", prompt)]})
+            # The agent with response_format=InsightResponse returns structured output
+            logger.info(f"Raw agent result keys: {result.keys()}")
+
+            structured = result.get("structured_response")
+            logger.info(f"Structured response type: {type(structured)}")
+
+            if isinstance(structured, InsightResponse):
+                return structured.model_dump()
+            if isinstance(structured, dict):
+                # Backfill missing keys so callers can index both fields safely
+                return {**self._FALLBACK_INSIGHTS, **structured}
+
+            logger.warning("No structured_response found in result, using fallback")
+        except Exception:
+            # Best-effort: log the traceback once and degrade to fallback text
+            logger.exception(f"Error getting insights for {investor_name}")
+
+        return dict(self._FALLBACK_INSIGHTS)
+
+
+async def main():
+    """Manual smoke test: ask the agent one question and print its last message."""
+    question = "Can you tell me about 3T Finance investment company"
+    processor = QueryProcessor()
+    payload = {"messages": [("user", question)]}
+    outcome = await processor.agent.ainvoke(payload)
+    print(outcome["messages"][-1].content)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())