From ba0ed169ceed827388bb5db3135dcc59bd8b1322 Mon Sep 17 00:00:00 2001 From: bolade Date: Fri, 29 Aug 2025 18:42:55 +0100 Subject: [PATCH] Implement investor processing and querying functionality - Added InvestorProcessor class for processing CSV data in batches and saving to SQL and vector databases. - Introduced QueryProcessor class for querying investor information from SQL and vector databases. - Integrated OpenAI's ChatGPT for structured output generation. - Implemented data cleaning and control character removal in CSV processing. - Added asynchronous processing capabilities for batch handling. - Established connection to ChromaDB for vector storage of investor descriptions. - Defined structured output schemas using Pydantic for investor data validation. - Enhanced settings management for API key and database configurations. --- app/__pycache__/main.cpython-312.pyc | Bin 0 -> 2311 bytes .../pydantic_schemas.cpython-312.pyc | Bin 0 -> 1640 bytes app/__pycache__/schemas.cpython-312.pyc | Bin 0 -> 885 bytes app/__pycache__/settings.cpython-312.pyc | Bin 0 -> 672 bytes app/db/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 167 bytes app/db/__pycache__/db.cpython-312.pyc | Bin 0 -> 1623 bytes app/db/__pycache__/tables.cpython-312.pyc | Bin 0 -> 1295 bytes app/db/db.py | 32 +- app/db/tables.py | 23 + app/main.py | 41 +- app/pydantic_schemas.py | 38 ++ .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 173 bytes .../langgraph_agent.cpython-312.pyc | Bin 0 -> 11285 bytes .../__pycache__/llm_parser.cpython-312.pyc | Bin 0 -> 2916 bytes .../__pycache__/openrouter.cpython-312.pyc | Bin 0 -> 8681 bytes .../__pycache__/settings.cpython-312.pyc | Bin 0 -> 681 bytes app/services/investor_parser.py | 449 ------------------ app/services/langgraph_agent.py | 0 app/services/llm_parser.py | 382 ++++++++++++++- app/services/openrouter.py | 178 +++++++ app/services/querying.py | 61 +++ app/settings.py | 7 +- 22 files changed, 719 insertions(+), 492 deletions(-) create mode 100644 app/__pycache__/main.cpython-312.pyc create mode 100644 app/__pycache__/pydantic_schemas.cpython-312.pyc create mode 100644 app/__pycache__/schemas.cpython-312.pyc create mode 100644 app/__pycache__/settings.cpython-312.pyc create mode 100644 app/db/__pycache__/__init__.cpython-312.pyc create mode 100644 app/db/__pycache__/db.cpython-312.pyc create mode 100644 app/db/__pycache__/tables.cpython-312.pyc create mode 100644 app/db/tables.py create mode 100644 app/pydantic_schemas.py create mode 100644 app/services/__pycache__/__init__.cpython-312.pyc create mode 100644 app/services/__pycache__/langgraph_agent.cpython-312.pyc create mode 100644 app/services/__pycache__/llm_parser.cpython-312.pyc create mode 100644 app/services/__pycache__/openrouter.cpython-312.pyc create mode 100644 app/services/__pycache__/settings.cpython-312.pyc delete mode 100644 app/services/investor_parser.py create mode 100644 app/services/langgraph_agent.py create mode 100644 app/services/openrouter.py create mode 100644 app/services/querying.py diff --git a/app/__pycache__/main.cpython-312.pyc b/app/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9c2a4c1689218ae235fa426aae78fd34f019050 GIT binary patch literal 2311 zcmZ`)T}%{L6ux(6c6OKj19o9S{6U(6n-;c$iI!Sh36)Y(>Q5V!HO*$&JHV{7GwYpM z33O>zA=M@()!2wWG^J0)XcPO^q%YN{CVc@hVy`u7>_hvuN*|P`o;%Bcwe3mfo^$Rw z_ug~Q`R zUex8BH|5RwQofu&<#*;Jy($+-1svI<2XmoR$dP3|oQtF)7zs%AURgaVRSlZQ)i6@4 zvoE>ftaIYM0ugZ{zG>uor)t5wtup`oNc^rh?Cm2}t~QKGK$S>P^$$xVG`)6eD)vA7 zgx!_Sx`-Npb=A|-+WB>AJ*hdm4I$d()+Pt#Ra^-*3o z6~g5NX2Ammm5v`&j}=tQ2EG2!xRL2Mj)P`W$F_GwHMph5o!w8(jA~gXUBSr=55gq@ z1QEVvjue0aCW3|!p@cZdJnyQyZZgkdlj`I$q7m)}A6j`UIcnzAq^TFi6iYF3TC&&7 z6moE;l?3mj$@d3(lZs)RMtV%6>M&KnTgm5>IYl!%^5e`;RfVLfY1*_FHsm{7As9;N zK`?su@O)s?ytIi{!^}igGOtieZO>T8JD5O*m^2Jo$ndyb@`5l0-*PViX8@8tgNiu2 z3L@8ubGMjN0OwEO))#~s@iUJHO$s*8fkWBVlDRtPHb23ktk0F!c~Qs))-_im*ag&! zd{VUeT^%Z-tAcBvl!{W8r{V=Fimu1M;`R`+=;=jY$QfY-WrU9*nUiu+&epuJ1*q=b z&k1K@68KKayv{q(nG$@h838+UW9U0^3@4<)2~WWuZr^novw(F>Pg?*U%`}*os#Za_ zt%RRRyoOB36XtWO2?b(;W-{L)n`*{L{|8JaYQ`ig<zP%x0}D&e9gPKb=P9u>x=&GQqNy9s%e=Ux>dcs z)PE~B zEmxsP$I25KHFQ0*c0e2_V?E9I0`7@RGkcpryxxg>8l~%9elG8h_cX~jq8^ZMG-5tB zad|>c!keb!Jk8GI{s+uCMEMPui!fARZm)qHfV2j^Jn4RJzBQ?UQQFwQ97F#?X zUe!uXBhK2iw%W=p`U;p^;InqZ;8gbaV9Ol673?g%b0-{K3U6HqZ=G#hYUx~P>0AtV zmHO@k>&}kN6wejUe>xv%otIjlIcJ>g-i?_ro#qWFou2S<-VGgz`E@g+=%c1(Yj1Bs z=);7>L_RO^DJJDjYBQOty!$2^=|%{H$$7;fibXeY#Y2WV$Pne(ru;I@I}Dv$$!pZZ zr}$r%@|3db{5XM@sAbYtW>n3=dfrM{gQe2r8uaE4T`@*RsFEK|SMuXzHM(Xt;b%e3 zf~`{LJ<$%nS{zX;9h66E+0ILFlE&|9K-W#DnYRG@o77+;Ef`J{cN(jc@fF|hoJk70 zx`)0A`*Ac`Kf_QKF~)b1{}GDbM;-T3`vPjehZ6VD8~4!mhjM62D~S(LXgUHfL-3r6 zpNOB1PHlWFV7z%*L_)aCrLvFP8csBvesvMYf9>A$gY{$LF{;8lE{4jS&841j##c0y z1(3=c+!fmvakMOmxb0%DjDTI!pK!)qW6pdzh@{}@iJM~MU8(kg6e;zc8aOd<_PyC7 S*X*BqZ%h4UFOoJmH~kySXc_jU`@N zK@_2$+JB*V^6$`#q=zuzNj!KH8ZSNh-fR-m5JAC(`OR;BZ$93<_hvt3GARPzyU!mh zZxe)k!$J2;cF5HmAZNrSF0B%umMBG@s3v@^q*0O}o5a;lh^sU0gBGtT=}Jw4YV@f| zrH+7V_Nj(aEl^W^>WETDK~49mW+PJ=`+s}*Yv7i6z& z3%H{1h}Bwy8?d7SBw`id6=|h1$6GBcef*#`K6(78H8IuHr&_suQ_r{3+2ifj%v@8S zYt7AH>hpZ2yTR~0E`WJ3kGDW$$vB&hL_w43ZXU2WWTqFsDhziE1|P>WatIR$SR}FI zd1TWF%8glMb6vm%A`JjHnw&W|->+ZlOYuaUGu+<_SO)$5pLM6|*isdkDvZj^<2vJw zCrs2J&y0`2>D)xH5b&MzQ3SjO&mfEeNGLu>Kuz&1N&^9yM-5>Lnz#$_4=HWx^N{0{ z?Kn=j%kVg1sE5ZX&S>Xehjn;g(o`KjfpJAvJk~AVxA^}sDdSF6S9jZ#SDuoFlW#DAl*TkxS_0FL(Yd~Uz zsr(D56MqL|I#?$$AqF<2gryVj?8E}Z4c~q5-Fx3Vzjyx9Xw(s`2Unjbk1#^tq_JN5 ziYyL+Tp|y7I6xsDVJx{4D4{x1F;dWDh6TPXN3E&sjxV8;h3Dp?v#sLJaxu&Wf+W( z{Wua$zt9M2l#+Bpxz9zrbU~g+xy$?{g}|pnLzslfI)sQiAz|!I1IaBy&ZkbWI#GGb z#3mt56ve3{GdaYlew2Pf3)!TbeR~pz)Q*Gcg~OdF^z8$lZZ83@PhUSCe8BkKCiQAm zWOftDF>v4{fs{mqfCKk{J4e7>L8fl0iyQ}pGp!Cq?!*fkKWr9|ymglu@la%0EceZz zk`%$KlGzO;#~>gg93p2AgeyEw)^Yd0+0-f3QNtF`{kmelvc1#m?{4os>h01yJNM~U z@5zD-sa?)5xNw4PI)OXy)Rnp$N)F@w7hYG$=l#ITeM%)u1*Q2CrK#fM#PSxU&&DjN z{`jb+!#EM|VP=iy8{vUUMW~YTAu~*-ak!}l<5bAZgpZYe8a~+C44KT7q=&H+Q*7#@G};U;N;@fx4vFRuI**|K~tZ#iJhZ9hmgA_(|LzdCZ)<) z7At8eEyTKQMa9{UWp=^^*eRG5gx7F#W$lZ%_QxZQ)!%?n%B|8q$A!^^W8?9F)~#wg z{-Q|k|Jz&infvSI%iY$OQEZ1{t|9z%2_fI$`f1Z6t8ZSO0)}79(7N>Q)~oW0e*qSD Bo!$Td literal 0 HcmV?d00001 diff --git a/app/db/__pycache__/__init__.cpython-312.pyc b/app/db/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3173eb0374a22b0b98187be7befc0471db634573 GIT binary patch literal 167 zcmX@j%ge<81o{^@WPs?$AOanHW&w&!XQ*V*Wb|9fP{ah}eFmxd<))vJpPQ`QdR+I=-omfzypOT~>AD@|* nSrQ+wS5Wzj!zMRBr8Fniu80+AEF%yXgBTx~85tRin1L(+($Okt literal 0 HcmV?d00001 diff --git a/app/db/__pycache__/db.cpython-312.pyc b/app/db/__pycache__/db.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21b09ee2e3009d681ea96bb4b0219cb0b1d9bc24 GIT binary patch literal 1623 zcmah}O>Epm6rQoYUfa8y-6Tyz(p51O3clZnBvszgf#MI304pmHcmD#QVa8{E01Y7odI0t85K3k`>=;=mj4Zj&M;MxHnCy?OII z@0)Mtn?gZD(BAm!^X6wFLcegM-!eUAcMFuuh#-PpWb#pPu_u@U=DO&Lo@7d?F1Z;` zHsw^$xLHpzl~k8q)zeJP%b7Wh1Voj!UTQNB7Frtb|8j36S)!~7UE3UhD47lrbzQsb zD}paaRHD-2dTtH&&YM{x`E$q|qyp7oUZR6*xI5Dj#`ydYDZuZ{S3o^8FyPB1*u>&d?V1NeF7eW)!?N>MuGz1xJZc+qIY#Im@(Gw?b+NNp^vT zp%eH?KI|%eq<#W&O4M!! zhL~tpI|}T;^PDJAx!!QwVKb5Hj!*1vIbA8ia0L2p1Y#CpIHjmpFG78c&mt9_K_3d2 zF@|xehg3H~SN0<@iXq+q#jK#y;%jIHUl&)ff#;#_e6Q}4fo-`-#&!dM11AEhhkQ5s zo#(2}z@yc`ZLe6N<$F$bCa~Kc^`o%*YQRoa=VoWBmLCOvZN*`_n^s<;eEFb0+0mRZA(2K_H8~c^>4>MXC(KHb=>D2 zMv>b05@F99i-xpXdC_+w$8w!jsz*(#6DzVfm-Kq9r-XC-Mcr~;9o#Mj9D+d#WU+jZjy=8a4uO+%_g@n~f6#EI79FeFUmE|HT=i#dMuDca?GWJ4g5 z0#91&Z-te?r$Z09=%8OEC0>M>nMMrethM>V{VNR;sc?5R}M_Ms4Qb z11^U!1@|B*|3%XUWZ>1uUZ9a~?cvb7M|vY*dKmk5lLdaz4)vf#`PB=2j!6+RGX8|5 zfA1Di;(Y0?uJM)Zeh4@UJ>(A_)R7y#czi<|OV=19i4cT|9K|g_obi1o*@hKHR?A`h zk|i44TFbSY)QcHE`DA?0QK6@!%D(4-aT+88`vL;yu`D>5!UgAVN@(gYGfob69~WsE zoZm@-)N3StdZ=&5=?SFU|5uTBlxMtvv|W0fJqpYCRSVC8*by&Fol5+L2Ld>^!>^Ux=?9-Y}+a7Na!c;kKZUg(J9=LMLe;S7ck8HgHcHP!#M^X={6Ni;zo5JJU98KGsRry5eDl z{sDWlcv??Z+Jl}2(US*HOINl#9z;aMo6vgk<^!5;e#FTeMjH*emX_a^VN*%YAi z>GjLS4?F;0nDIvv9k4S-!D9dcUe4 z3PK+jtyu?=q}?%tqQ;YS3EQrPE9Nq)kc90)bbpNYTk*<{!`vyBA)2?!PN?zt5WNAS z_+GgL>99;|y+koh&vKDzl9Xw>J}f&7XH4^M*>XB7SOX14J@(=4jsLygyeCNID3u1=KMlj23L)!(x;X%O+%_D|r425oU^` zG!90Tg{BpfRGcrv&%qAe0e%(qMc+j_9sLz6usqk+3-k~dd10X6@$s@gHCfOtFZ4Zg z#l~nJ(*$*^RMO$B9u*UuDOGTihKiIzLGX&IRj}QAqB`|C*;SM4#TUKJ!(`zD~R{nY;fBlW}F7r0Cm7i*=(=>?Mx2@#1bN#K{U?Vr! z$_+Pi!}Y~y&QoVAcYa$PXsJUDb*QC|G}Mv${bvuKKHO3-?8;)d^k@RJ#}HMcCGk;; zNh-MOSPpH7TZy~IZaZpS+_nUqJp*Sk`xV4jagw1Z40}Sql&kSXT*JfEWG^Tl?Vd1ulODM|PzyPO24t0{V%z{|0&JGp7Im literal 0 HcmV?d00001 diff --git a/app/db/db.py b/app/db/db.py index 81215fa..401bdd6 100644 --- a/app/db/db.py +++ b/app/db/db.py @@ -1,11 +1,12 @@ import os -from contextlib import contextmanager -from typing import Generator +from typing import Annotated +from fastapi import Depends from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session, sessionmaker -from schema import Base +Base = declarative_base() # Database configuration DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///investors.db") @@ -17,26 +18,23 @@ engine = create_engine(DATABASE_URL, echo=False) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + + +db_dependency = Annotated[Session, Depends(get_db)] + + def init_database(): """Initialize the database by creating all tables""" Base.metadata.create_all(bind=engine) print("Database initialized successfully!") -@contextmanager -def get_session() -> Generator[Session, None, None]: - """Get a database session with automatic cleanup""" - session = SessionLocal() - try: - yield session - session.commit() - except Exception as e: - session.rollback() - raise e - finally: - session.close() - - def get_session_sync() -> Session: """Get a database session for synchronous operations""" return SessionLocal() diff --git a/app/db/tables.py b/app/db/tables.py new file mode 100644 index 0000000..c66d162 --- /dev/null +++ b/app/db/tables.py @@ -0,0 +1,23 @@ +import datetime + +from sqlalchemy import Column, DateTime, Integer, String + +from db.db import Base + + +class InvestorTable(Base): + __tablename__ = "investors" + + id = Column(Integer, primary_key=True, index=True) + name = Column(String, nullable=False) + aum = Column(Integer, nullable=False) + check_size = Column(String, nullable=False) + sector_focus = Column(String, nullable=False) + stage_focus = Column(String, nullable=False) + region = Column(String, nullable=False) + created_at = Column(DateTime, default=datetime.datetime.now(datetime.UTC)) + updated_at = Column( + DateTime, + default=datetime.datetime.now(datetime.UTC), + onupdate=datetime.datetime.now(datetime.UTC), + ) diff --git a/app/main.py b/app/main.py index 4781a23..a09042e 100644 --- a/app/main.py +++ b/app/main.py @@ -1,7 +1,44 @@ -from fastapi import FastAPI +import io + +import pandas as pd +from db.db import db_dependency, init_database +from fastapi import FastAPI, File, UploadFile +from services.openrouter import InvestorProcessor + +from app.services.querying import QueryProcessor app = FastAPI() +init_database() + + @app.get("/") def read_root(): - return {"Hello": "World"} \ No newline at end of file + return {"Hello": "World"} + + +@app.post("/parse-csv") +async def parse_csv(db: db_dependency, file: UploadFile = File(...)): + # Read uploaded CSV with pandas + content = await file.read() + df = pd.read_csv(io.StringIO(content.decode("utf-8"))) + + # Process the dataframe + processor = InvestorProcessor(sql_session=db) + results = await processor.process_csv(df) + + # Convert Pydantic objects to dictionaries + return {"results": [r.dict() for r in results]} + + +@app.post("/query") +async def query_investors(db: db_dependency, question: str): + processor = QueryProcessor(sql_session=db) + results = processor.process_query(question) + return {"results": [r.dict() for r in results]} + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app="main:app", host="localhost", port=8000, reload=True) diff --git a/app/pydantic_schemas.py b/app/pydantic_schemas.py new file mode 100644 index 0000000..08588b5 --- /dev/null +++ b/app/pydantic_schemas.py @@ -0,0 +1,38 @@ +from typing import List + +from pydantic import BaseModel + + +class Investor(BaseModel): + name: str + aum: int + check_size: str + sector_focus: str + stage_focus: str + region: str + investment_thesis: str + investor_description: str + + +class InvestorList(BaseModel): + investor_list: List[Investor] + + +class QueryResponse(BaseModel): + name: str + aum: int + check_size: str + sector_focus: str + stage_focus: str + region: str + investment_thesis: str + investor_description: str + reason: str + + +class QueryRequest(BaseModel): + question: str + + +class QueryResponseList(BaseModel): + responses: List[QueryResponse] \ No newline at end of file diff --git a/app/services/__pycache__/__init__.cpython-312.pyc b/app/services/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9191821e8d8caa29167735920660866a2272a897 GIT binary patch literal 173 zcmX@j%ge<81nL(zWPs?$AOanHW&w&!XQ*V*Wb|9fP{ah}eFmxd<)fdGpPQ`QdR+I=-omfzyUz}Q0mYJMd ttREkrnU`4-AFo$X`HRCQH$SB`C)KWq6=*mk5Ep|OADI~$8H<>KEC3JAEd&4n literal 0 HcmV?d00001 diff --git a/app/services/__pycache__/langgraph_agent.cpython-312.pyc b/app/services/__pycache__/langgraph_agent.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe524c0d6bcdaf52266d0021e07a6eb987a590c4 GIT binary patch literal 11285 zcmb_iYit`=cD}>m@GX&gQ18cBvL(@yNIUXVeyuFYvSVA0B`1!x-EwKpNE(?OGBZQV zVyGnDcoz!bcAfsHW$bRFEU<_cb^&jH6hVJw6E6^8_XiniOLm%cT_gpXe~jf|x9Otj zxpz2xiQY6t2jHFiIQPuuz4x5^opbK5tEy}Wp1=C%U(8SJLg-Um&>tq3+4@Jw+(8`T zs1TaLZ$pR*8)gh)dWH@gXN+NHh6$TyOq51ro-qRr9kPV2GgeY(4B5i=8GG0<;~-@w zR28nCsU~?-$QgFcxJcd{stMQ5)RMdpBz(d(@H?$C!|1h9teyIe*+Vmr+HZh29T|}7~`d-DPE37 zM489jq0%DriXw<}a!?zimKt+~tq^4HARf(79GWpak9e9la@0|%*K zL@@#r2va5d{Sw|;5}pMKucu_fNfjhNM_Z7hucm$TkudLzgkp;UIUt4w-&iCV3&X%= z-{pvO&3FFnm@gnI5z)UWNc^k>!|(;7QJ>69*M%T2`$7S6ZVoS&KY;(QH@YOb@KG+l%Q#%O>)trCX>?0%I@ii8%t3Qug*l3+XAPX;CI}5j^9QGTcB*?th^oQ9YAk`a@CUE<4|kQh}U@;){bmMvdr4j%Gw?c=HvuM zgbU6-zt>+8xd$}x7=0~@L zJod?W+q|MgkH0Hk#NAzUB&7n8c^m{w~$*fKgY{K zNg%37R_#Tl_+EWc*`ivR4Tv18^~OTMD)YQ7$m}c#i3|$Mh=otJgn1>v1(bltA+a!8 z$$}F+y=v`+=f)?eEUxfsfTsfNM?(Unsw<}% zxU2r4R?{@cD}K-wL5T<5=nqChAy6J5*p{?nGem9fysC- zSypPtLj$D()2sOX@mBq>7xuGPZ)^lo`DI96Lz&i|72~F*>EpWgkL%mg^*#{MjC&|k z+mmT(eqv?nZ7arGmM1Q>!;@wYr`W^qE^o5uahLW?OGmoJmum5)TTZ81PG@TR9=Y0+ zol{xb&`_OaQ0p0L3o)&jfivqu4J~PRPs-i1;qF^8{-uqGzG{+qSYa`k+srT|gs>10 z;d%vX?;s`LrsPFX-~7u|f?CiITGCldI1tE6BByV53PiSIKLhl`3tjenZ9f|N5hzgF zv7{eX(Pa-u>Hma`$XV75MXV%BxKyG6HS)DIw1Sm4c}i&O^xDE(7xZZm(RkZQy#26@ zu;<`ZAV)a62&c$wD6**9@Zl%(0Vz1IR^@hhuILO#!dC^6*Uv(aO~Nxo&JfiS;5fes zlCHY^1r}OOoxg;UR_O-$C}F2nI{_j1LA<0Te>5V1m=VT2H^)m@o2h2#6%|6f#~|&; zWs5%YVyTgyfu?w8C4niDy#XMSe+tQK=(D=^A1`dwc~&f06SCH%EzK!Q^XkC5G2M48 z)pu;uay(;k`V3^-LxFbR{)>;)0>vPkDQHZXJ^XUx#8TAwCvDii;*$rDml0C=6M}{Tl+wi z&LRrL%L<~vQ5fhf8w#QZn=U7I%QQzP3=8=)4G43OzD<4vX!R%bvJpfee}3sS2rcAK zr$SwV=8OrWK!XzTTS;SIG7g_53w+9o|;ctdSVp`_=a7K5I$hhsR?kL~)LQ|Q+BjVNJ)l`&)J znV_aoH&QzEPCJpbf7t>mY=>S`cypFHq|Ym7Rl4<-ghl86BlHu)92JKK;BUeF=WvdE z=?d5h%a&Jw_jE3|NG+lp^cA#7{T;nXdF*lTG!b{A5vDJmWAPb@Pj5IsC061?{B?kq zx#Ja}u7uG?;BDAHHn!}U9zQoedWqdukIC>EJW#P^k-sVn%2OJ(QymV~qVv8oo5K0; z*Uo=GIsYpY&+`EeV7UaqMy^Z|h=u)ua3m%wmD^`yA}7PyJR7Op9+YBYaGszbu#KcZ z@F{#kf_RPR1koRrBD0{_Dsz(~F)7IV6)C37>y+Cj^VHOZ^M!TVz6_U7k57$nug0r$ z?!x8qsXn|$9`@Xsv*YZrru6+VL1Fg7)V6OGI#pVs|05F&-bxnl8}lh%*Kgc;`;vWQ z7Uiq?jTuy0=n71>t;e}D=g(YX_wFfa_=?&ECJ&KrvXCkbN3+K+c|d|yb58Em`bsOP zHaDeL!gfo%7V9PLkFdnx9%d&! zwGy^kR0d!R)Uh~k=5O#puvSzPSiii;k$)rw$s|rL;N&@+T!cim zj^7CK#Pm>U!1EG5NIg~pHl#63h1H*G#TbXU3#18L2H1=J1!Tx7Edh|Cco`pgSWd`8 z;JkDUs^Z;ewppg$a?pec`=h zO{Rt>GkyK(zC)?LLzy1WJL21-{_^fkd&g4VvCM!kJ@Bp6z_$uryL!K9bhsP7q^oOc zvow4z+l6-eR>p4+rW|b6fn4qt#~+?pk*oPJGQi*L=zdGME38`{bZvC(PaZk{n_zNk z`q#4`zL-4xLh{h%RL5oL+xNzWO!qF#ZcSq1>nHM&oT|r$OwRm!8=R(uAbver2L|iw|+`n?&aVcKq}aVe+F=G znE{AptOUmdfEquepuZXO%yG7tmJ{UrA^--cCw{Ubx}tKP&cxX``;Y0L6-iKn)htvWKfG0yWi2-jYg~_5WD{YO3;}hI|P?jdR)j3h)L{Lvm%H z#uYzS1Zp(vqZHT3d>BxT5DbJuOROw}g-}2$oSj6-KgZUMhgPk+<3LueobGpnEO-L~ zA~@H8#7{e1?-SNw$jSLuAbjFa&@s*}< zA8gHOI?(+B@kDM@gJx+jrqR?AFM?YToGiTbJg$5JCzo+T)R?7QjhS<9PVyO*5~SUj z_S=xico}o5OLIj?$Dt783>a#acai^##^ZW9yyBn~`)wf2yHB)84f@ zk!fjDn@(iSsBvHm85^rtCO|QCuq%@pXT$B6KXP`jn;+OyeMjHD_)*`9kGlufFF*Kp zYH;Me9jU>wjqdTs$Whbq=GogLw{K+XyVCWZRJ|u_ur{}3I(o7+yy2T{@7l|$&Y=e* zA9Wtc(#W$X$1v4*;yvo4zLSr3bU*6s`8t}eOpo_*4RHLDw!0g$G<-MPhI;p}j3;Zm zQjTu$i)i|+is&;V{O=NdmZ|N%Pp^;MGp*f7y7qu)<#6>E$k17>sWcd7uZOVut)n}q z)vADpuGL1HshcNjPd1_V8w`+tzsWkNEin`q za#{tJEB|m*K{l5SAitGpK(FaW$ubQZfOyuIjRI(;gaLFEY$VW5riy)y9HaNGq&Ial z6wF4mOx@}Mvx!L%W9HTX=o=cigO2}y<8Dqc*D3f;f6;gL8c~rRW-94eeU!Ss0`tux z*C=^2cMcV~bc@VB(6>e~u&saEA)K&aZQBCjocmS!TtTnupD@c-IDsmd!j+uDN?sS| zM`=sg7Akm%5>}n!bubfOhO_E#=sWZe7zQod6L#I0#QOFix@8B`HE&9TmOruVc*-tu z7Iq2eQm~hjJPC*XwGu7w#9IDO&mb{p9k0Omh&chaUKS-6vZcXZXkp!X@-LvCNTd2vcAQB0`jS@=f!JcWKrz;+E2<_ zsE@b}hS~E0L4?B=NB?9ztxLJj!>eqlfCJ`u&D$`{YUd*vBdO2zB~00f2&47Rc?Z~O zp675X3B1gX46wWs^p;P7#Lq$SV3?iYBP2>71htTcuh;@(Ly;f}B$f?ITeKpJnyjUu zgB&uz=)4e$$dTy$l23D)ROC_Wd>Lj>!kA}^<1cJvmh3xypOkTdXo!se#^w1XAcMG! zPN!{I`2X;6O>9bfmK3yUg9yVQu+=E<>zp&A&4cJ1lWM062DQ>y`(UuNmN7x z=!ZY8j|UF~1Suf#1u;<~h0%H0DXdPQ3#gGLohHbaikgPj>NG7xRz^2jA@D@7*ffk( zHHHA^0Rjw!V!TYEy{d5m=1FC^SU4(6*gLH@EDGT7ftYbDh~s9QA3|}_m;zxo7<1$r zuK_%Q?W&5f9Wk8qs!Vm|Jnfv0OQkEoT=N3h0RZdOstHsz1jcx-E9q#1cWcjML|5&*Z31idPW;XITF-jdM&n?*aVXU| zw9&Z#wo$EX%nS}C>vyhQ`lx<))(q5uE?YhSi+-zP$-YA$wH+$&{{;GXCOduU&I75= z0~?)(Hd+p^oc*`Dj*oXAdGG4x?lZTUWJ6EN)thN-yY0xDsFt0Xou0LWS>)`mdeVWa zYkxTN#>}dHZFFra+0^%-X0vK2>DZS=^!}>sj=Bcd%Bk!C>gamw#k()2+mEK&k8U2D z+-Sc5Q_$X@ZXZgu4?Q@w(S9V|cx2`DqaJS_epVCsnXN-jNB{GQg#WhAE}4J!&+*r!}zeyit}v-Xn5E`;XG>{@1q}j_KkPbziKx@ z{ja(isQFbN4S8b8lwemJa3B_W7BZSeUI=U?0Ihq!5sKOxmT8A>1W~6RfUOCMrQ7j;JV)QPVQS*sQ{VX#p<7kg$|$*KWp;f{f!v zSVd?->?+VJ_z!3vvv}@|EWVlHEur{V=q)coLY!ZpHFafb8#7HUnb!79YiFj;{lso+ zwyjiwt<}Ect!M8(n{M-_+Pv$_8*RtZ4aZjOkL(RUJn_bf)kB;1ogX`DlJ&dRu07~W z^&Cm=I-0CKw&^(j$l3D4SKoLQBzSZ0$q%PC_dd7byqL6H%-T`o@vmeYRQjH8C_Xlu!#{Hlri{SO1Z41A$-!eFvDQdkAbP&9gdrv+g8EqtZ5JC9mAH0J2 z+3JJj=~s3*BX5FhHw;|6F_Q~b4bw|vaQh)3vB0K8ssSBqw*q#O4-`wZE_rey`Yv^Nl@49`%y)S=%L6e??&{#57 ztXJ@Ow;n+PSO37T7XmEE08m-V^UJ8@;DCoD)gQ1aVupVYhWz z*Nmes?ErsJ*V?{+a`a^xAoxngTJs045rlq;c8?6wzZ{?-h1q+$TEWN3(-p^G;8vV` z0Y7;cBv^4!%zmsm;FUGmnA6nSP!{1g+%$MXGMd2Sex622f|qJiz`zpcwA+VxZ)rCY z$vs4T_)3GA%7rfx2Im2K`n3ySz6TUKtsdiW|pQ+iGn15wp4R|WjU%{vpg>SH4Kn~pZqK&S(>7# ze?j)&p@Y9g2R}v5Pf+jgQ2!^W^%KzCHHO zb!N}UxweD?tw4%Gkg6J~sl+1|RBd0Qy!b6@73oe^d)9`$G!ISR988rI^`YPF-5Nsa zOUK@CzVF*_zS*1k&F`B(XEG@SO@8~2NXPN?;_QA0sn%8JRMXZ;PFjrKEs}WX4IN(%B_e2;@$m z$3gO>&dBSYNo+6@jQGCg&NHKvsw2B!iM9@by@CkhzqTZtMMQ+J#>c@MYQnZAa;+id zV?lDtt4|wc;^;w~5yQ7UH&Dohs;#?vXvA5~adg_4gy4jxxw=C%El6pauF8< zSxvXQZ$#$PpjXp$*Y*4`1=F-uw8cO5!DO-QIi%>>wYtu9*RhJ@o>6m%>$BodJ$kq1vDcNE!;RupEeApp8u5rPnsH z{p(UcsFz=(+#Loj4wikt%8nL`9;`C;YCdGqEttS6E*yp}NX)`oYc*;Iaqwy9NyF&w z8Kgic4$mccwswQ}G{UgfKKF;M(#1ez^S&ZIYIXDq&+=F^u_>7168a7Ltzb$Y3iH^E z`TQq^t)fKQ!~S+xHRTV5JMjpomjKm22Q>UzSP_32i=k!wNS4gQ(UG=w=!vunU&_Z?gyw1Vee~YZ--&gcPfk7f=_kBS1GZ{0&Ke4#oy7?st`!jvq=y)Jl6+w1WJVgCcN1SKomAg@*7ps zb+Qn>t6~rXL=rnlID|2Mp0J>o5?1wG*wQ(VI=WB!7UffcY}gi0Dj8)RQX8IAwFw71 zgIAR301LkOK{iaNg;RHC37+b>7IfP(AufCeUs?7zID>e1YB~VJ;YWzntHUpWuV8#? zRi#hyyh@dN5Y9AsO$*@Rq{5a#e2Kofi0)>0t_)lsXlD<$vIpDQVk=wR=o#J^*w-9A z{mCnzywohb*~(2dv(p><3LiJxCrYgorS%hUw|2d=8ArL%EhOdSuaT6NFOJ_&q4bWn za4c!G|apZp?Q`KElowIz^FB7$ch@&aWr=okg2=SGq&ww6Cv&7fwv)uh0 zw__dF3K=WUz0Y&69C}SU492%G@|{O8q5-AW48KN+NneD(@ABXo5P%gWd+pfOV{K)i zr3|!{11;r1TN!C7BN1X|n%Zo0#%LDIR*p2Yb9Z;`1C*o-fD#UnO#n#Wwc}Th-@sQ- zw3YoWWq(`Ax0HNS%7?^hz%<)!n*hHASC;^{3DU$76}*I&kQwkMI`5{hE^4vwR1 zf+5T!LwE;BYFY5NDRN1O;;swt^-5@2gtyyhsca)6nhT*N@qOUvE)9wu8Z-nb_V5># zX^f^}E)37IQ0KIjdj=gw4A;ST4rm<;GrAz2Lv8%5=`5 zs^$6=t_9&WLIr5+^Hi^v;kf9&8eS?amxYH13aw?R!0raA`j7z5RkcQ|9wVwZ7e<)f zi$Ou1@Krbr3zkV3AK{Y=z6yj6b%U>JXE4-q+j1eG&ZmNWau@;RYksJ^G#M5nGg zM?ELZKIhrCSBINaRCHJ&-$(gs(C>i=WWLF|MLCKo->V=|ZJ1D`EQ9ikW{{0;B!%co zMme)}3Pc3x&YXSg)RZ>%#)NkI)VYupLXd<<7vF2)eJ*DAl)-8Qw_ux6=n& y=>zv;VoJCu-%HE*C-`1Y#?Rl&N%#=NVYwv_ZX&eD5#e4Icg26gJf0YS!+!uf%)BuG literal 0 HcmV?d00001 diff --git a/app/services/__pycache__/openrouter.cpython-312.pyc b/app/services/__pycache__/openrouter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46d57bd3895341e45d2e94e48ee40b4cb14b231c GIT binary patch literal 8681 zcma($Yj6|SdS_pfR$j^P_iM2YMv0LO#6VmEacqOl!w^hvFYZls7wy_!MX&6xZLCO5 zZo)L8V>!I_&#v1@EArXm02uTqaIp?n z7GVq0sx0UuI0b$Hk9D;ufzMP}J3+!%g*giLiEqYuf-M$QtVPFFwgh8g7VNB{ILd;8 z?G{w5MWJUS=WDOdfqcUL zl1}G=sKQG^V3vojK5Of6BC&}531ri<6cPc;Ha;SRr9u8AY}-0iugXu)g+fLT3=rnR zgZz*RLag$05m8e3un-Yuq%bfvzZK3RM>J_hQu_EgRpR@3d76(z;gIu!95QqV;0Vl0 zfmi&h9EYV$sWE07%{*>45CJLRpdJ{nnu0SO4a}*7{G{>48(&Wj)LdIGaa4ZKZ1+$nItV>>sS%1*%oFR1}VHk9cgKT`a_ z_l+6EqV!|)o#WwHRKa{U$O8$+r-4a~YvN_~af-rx0}WMpqP14nil3l z8b2?DfJ6ALps=+?%m|o?fLnPDc-zFR_Q8GOrpR3Qw4|t*^rI0Kn2{HN0ZIlMB7)bD zl+Yp%ClX@7u$aRF(TFC%h9Xe|RH7lcZ2`*;AOw6TOtue%Bq7p=HzkYwp~+(td^8eT zbUO3hm@09eHZWXMzbsO9c+_Zw#zk+qHUUTgD=){R* zFP$7c;U79a?ms+wN~c1hu->pBYqS0$k^Mj%pq+yCw(c%zhrx;qig5E@`~^1;SnE*Ml|AIkQHfK0Ybo4#(b)z%*$YXOK&*_ zx90cD5n1#5<1H4Uk6TvSYxUK@;_6i>-at8LUGuIKecpAy4>fMi);BM`eCg$DQ`e5% zr^rSpbX44Tpt_FRb#0kCU%Jkht>2lgc{3A;P@!Y?J z);kWT=$nq#Z0iDCvWmcjizi%N2bM>Zt-PW#?mcg+3Kx# zs@m5&Pvj`F!JT7J>o~E2m{wTlmz5i^U!*gq1uZbEvqt5Y#WT9iv@egj7?=&-f_LLz zfvXQA0=t|4$G*k#Q8R@=qME0g9TGxoF$TC!PP0-PyZkNOtC-7ibN3e^b?P0JBt$R z2(nIP?K8xcFN0%c&=zQwuwMqZJmI*E z-lHy&=iM|4P)SG9URc|iphPwSo@*RdH^w@lB;RA=aDpUt{uT}1vQ}_v#EJx27|Xp! zS$GI=%7O()t$6TYZ4tJhL}AujNOZjh_x>AdiTX>1LCK0lMX=5qSF9)DR78H~3urew z-_VV;f|J&R6h6t7?9pAaM@NE#v(v?WVhWvtb(7p*aFh9ZfmlFis8eWx_${?Scq-!^ zc}eC4a2+BuA&EDn02^1V6?OE8a6TgVM}+GU!3Ih>9lN?a;oTU4!jy*udbtT`Noa*} z3BuZsh+g>_Xb3t%h{QZnp#BZ9L=fnF3yl^@V>i@yJfQMWiWe%S9~ax9m;hTqfiDL- zoAmbUG`5H;wk$laY=P<#yLU#4fWN8ii|;I>pg@=7h{A6i%)2)f4G1B1pSLjSCBUIR z4aI+<2XCNzh<>W!P8)x_wPRVk{Ol_EH~Uw`wOwOr@7R3?HMifTp#PzYPyc2IPsGVS zK28quar#7bf&UI4JYJorDfSUeYQ`zA*E5b_%;c6&J8A%jW9&Zy1+6jpUI zCh83Mei1>Zz>4U0MS@r{An9~4Do1n%9F&Nr@UR}3SFm!?S0vqKT01Zu*wX9l=$U|I zI8GjyvKdAy9$Y+y=j4?g#!O+2N5R^9h2dC8q91=y*kCh;Sbho_AZLN}fNc;)*EU@VulTPGuhs0i$vvCx+LGxSNOuinx}HmSJ$GGR?;6Xrjirue zYxzvAH(l#piBiy!=tPFuXkO4@j4)V{w6n+ zt*!s%`2XY>81x7A&)^Q;9I}mWB5trld!YJ^s2tI38Ndq))Dr zALR`;I0$8P4ngc}KJgUI1*pXkjSbBcBk*TALt`0Q0xg<~or-r%3iHL_NaeLCKl$Pj zL#bmYe8OW>aH>G3AS4C9vJg}1Txq-rIhbNlh}|L_+Kt7e(C`h=#+m=Alg`bPRVmtlI}q{xrCkW`4|2t5|!OdV}+;a0tH^^+P5o-Z7pZ zRiZUY`ei_?z72(;GwxK@W-2?ss_e{iH6UuuUH1^ndQ)S!TY9tg?bspB*0n7ixpXAk z+MQ|jrdz$)_RX309qINR*|wfcTYtK(Kik;3^!laO@4M`+?$qI26>?Q)xVALcmf`p` z$FH>B;FLpvZ;Y!A!dBOgh29u9))k%lU+&O=PUp)8k&w`CovbkV+>AOBq2dD`sSZO3;w3 z1Ah!ZSZMNMzpPh8d2rRd_KdJrh-!Q>x5tN0d|PfSkVfd}+5wyLf6?yl~e1jKJd2KL&UkmsiD-FY)e>C|!PHomt2d=@`KLFzEyb_tmDvx%SHf%U&)nn&vRvhbL+1`HH?M48>0Wtp z<;(}kE6Ggn-gNKY>yw{!ulF8U+j4NNaqJd1o@4RScR9q>-{Gn*96NU`!*!;)&Q;I# z#V-SET<0zB1cZxReO{7>sbPlDNpV_tntl&XwdwA#a0VjuNMKG;AixfknjTD8L(&~! z;NC`WbEY}NvJ5X%^pZuQoNOmZi$g)mp8_LigIKc^KcL4CL{zo}iKChp1k%g|3rTCE z+b9xtV+`zLnzTG}Ffx5KD#nfFWVvXP{)$VX6gN$Z^+U zXow_BLbRGFB%vWb;~+jGA%3JpS|;8hAUs*zE9EDXb!{VRAk zu0!!_((;&$u$V}a=jdn9Tktcel(=vxs$9|n{xFc1k_AGTbMgUefE4kcG*sw^@5gaY z;&9amZzH^HH`?&{}W2VXCYKE5b0-an`W-b3asD*2sW5UyKH=#>c0>2a6bg+8!7R zE!*h<>z^9E_-wY3{~i&pVdAb)X4^Mq+WXV({nzTRv+M0IWEx-ip2FScpnJxk2ie9K z9^{PX2WERIia)5wVX?Qkp%&r-vEO!)0^nB-Lk#sHF+|w@jsobnxbvTgA1 z!s-x7s*90;9M#!Li-Njs2GX#SqB~_s5a3@o0ZAFggKc0uumPHa4`l`lE6SY34cHx1 zf>7Yd)eI2rhTa4wmT;d37g1b4eEPhaj`8CENY<%R?)~ z%AqUnwK`vFG|Saqm^e4FJi4-d)p=$ATI8o2L7^^nWloQ60pvR*fkV*i__ z*kR&6hdQ^sA9**D9X|A*uEW=+((ay>sax*;uUv<76xHdx)6sR8f%2o zcd#vWUVvCHHdTR#hKfZrfca;%c)s4VjwmbWQil!S$NH4A-~D z^%-rRbd6`VcfDpuhTE~m?YL9fka9n~*N)sr2w1b(`5@PXT-zRGxgC$xF(A#^p+ zabvJ@c(>zI)(P;ZwZ!mV`qR2*40}CLe%43~@1s9!YQk_oF@osNwsRQXM?m>G!lj3t zfWP10VTVvjLgA^ziDR5(_xp9H-wz4@xe$gqzyH-aA!PO_*r4gQ=;n0DiL5sJ?t=1ApZX z5mijr>d3XG4<~X6!0Xxuu5vxq%p`HG=h~T%y*Y&8Pfl(ab#5y42&@kg3sWoQ8cJ@o3dDqg7u=_5CZXrJeeE;-$aqTAT01ZLp6VHR24hT|2D>N4!tV2D8f$xIp9|z8Vmz! zgVNa08^1tI!k)#S1!6yP>@}}-pYqqx0WyZvJ}5xT5ybD1`x~_B_o(uBsP=2r{WaSD YHR}Ba?fe?GfA1DZqVXQWrGf2#0i7_44gdfE literal 0 HcmV?d00001 diff --git a/app/services/__pycache__/settings.cpython-312.pyc b/app/services/__pycache__/settings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64d23fccd82b9ecf9f956752e9e4a5b98ab989df GIT binary patch literal 681 zcmY*X&1w`u5U!q?UB|ea)u^BeKER0VTs%n#s98j!n~)g9xirI0Z!%=2d(%Ceu;N8X zAmRh$8Du^94jw%U5*j>t@@CwCuqUhMN7;s{`l{#a`l`D6OS3r#xITaTwEGeL4--_Y zR)WcW1g8)|L=s3zKnPgy1S0DQBAb!1UGM_S0%ncNLxDYVyxLC_c|Xnq-Ib|@phu{<#1SEpM+6HDbCnC$Z`(JEq73`p9l54lhs2A2ZI^E6n?H8R*`f%eZ zUF*D>Q6ZJe)fttDV40R@Pg_-{_G7{O@y=_f&GagQkk#a*=rBV8|Qk}jscgr73z2+pqGe&@=0#L{%t_3HAn3Cuv{6*Ssb znba*xL(WAOX0hOk(wr=thLoRe8DvJC8#4~IfaVy^ZY_Ovmi{=Tw)h*+6m-Kh_i}A8 zim-#YN2^k$c?WqC?f%=F)tS1>#TBl|n@BcOQFqY%vI!yI;qFD Dict[str, Any]: - """Safely parse JSON string with optional LLM assistance""" - if not json_str or json_str.strip() == "": - return {} - - try: - return json.loads(json_str) - except json.JSONDecodeError as e: - logger.warning(f"JSON parsing failed: {e}") - - # Use LLM to clean JSON if available - if self.use_llm: - return self._llm_clean_json(json_str) - else: - return {} - - def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]: - """Use LLM to clean and parse malformed JSON""" - try: - prompt = f""" - The following text appears to be malformed JSON. Please clean it up and return valid JSON. - If it's not possible to create valid JSON, return an empty object {{}}. - - Original text: - {malformed_json[:2000]} # Limit length for API - - Return only the cleaned JSON, no explanations: - """ - - response = self.openai_client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": prompt}], - temperature=0, - ) - - cleaned_json = response.choices[0].message.content.strip() - return json.loads(cleaned_json) - - except Exception as e: - logger.error(f"LLM JSON cleaning failed: {e}") - return {} - - def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]: - """Extract and structure data from CSV row""" - # Parse the investment firm profile - profile_data = {} - if csv_row.investment_firm_profile: - profile_data = self.parse_json_field(csv_row.investment_firm_profile) - - # Create structured output - structured_data = { - "name": csv_row.name, - "website": csv_row.website or profile_data.get("websiteURL"), - "investor_description": profile_data.get("investorDescription", ""), - "investment_thesis_focus": profile_data.get("investmentThesisFocus", []), - "headquarters": profile_data.get("headquarters", ""), - "aum_info": profile_data.get("overallAssetsUnderManagement", {}), - "funds_info": profile_data.get("funds", []), - "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "", - "crunchbase_extract": csv_row.crunchbase_firm_extract or "", - "linkedin_profile": csv_row.linkedin_investment_profile or "", - "source_truth_profile": csv_row.source_of_truth_profile or "", - } - - return structured_data - - def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]: - """Use LLM to enhance and standardize investor data""" - if not self.use_llm: - return investor_data - - try: - # Combine all available text for context - context_text = " ".join( - [ - investor_data.get("investor_description", ""), - investor_data.get("crunchbase_extract", ""), - investor_data.get("linkedin_profile", ""), - investor_data.get("source_truth_profile", ""), - ] - ) - - if not context_text.strip(): - return investor_data - - prompt = f""" - Based on the following information about an investor, please extract and standardize: - 1. A concise investor description (2-3 sentences) - 2. Investment thesis focus areas (list of specific focus areas) - 3. Headquarters location (city, country format) - - Investor: {investor_data["name"]} - Context: {context_text[:3000]} # Limit for API - - Return in JSON format: - {{ - "enhanced_description": "concise description here", - "standardized_focus": ["focus area 1", "focus area 2", ...], - "standardized_headquarters": "City, Country" - }} - """ - - response = self.openai_client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": prompt}], - temperature=0.3, - ) - - enhanced_data = json.loads(response.choices[0].message.content) - - # Update investor data with enhanced information - if enhanced_data.get("enhanced_description"): - investor_data["enhanced_description"] = enhanced_data[ - "enhanced_description" - ] - - if enhanced_data.get("standardized_focus"): - investor_data["standardized_focus"] = enhanced_data[ - "standardized_focus" - ] - - if enhanced_data.get("standardized_headquarters"): - investor_data["standardized_headquarters"] = enhanced_data[ - "standardized_headquarters" - ] - - return investor_data - - except Exception as e: - logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}") - return investor_data - - def save_to_sql(self, investor_data: Dict[str, Any]) -> int: - """Save investor data to SQL database""" - try: - with get_session() as session: - # Check if investor already exists - existing = ( - session.query(Investor) - .filter_by(name=investor_data["name"]) - .first() - ) - - if existing: - logger.info(f"Updating existing investor: {investor_data['name']}") - investor = existing - else: - logger.info(f"Creating new investor: {investor_data['name']}") - investor = Investor() - - # Map data to investor object - investor.name = investor_data["name"] - investor.website = investor_data.get("website") - investor.investor_description = investor_data.get( - "enhanced_description" - ) or investor_data.get("investor_description") - investor.investment_thesis_focus = investor_data.get( - "standardized_focus" - ) or investor_data.get("investment_thesis_focus") - investor.headquarters = investor_data.get( - "standardized_headquarters" - ) or investor_data.get("headquarters") - - # AUM information - aum_info = investor_data.get("aum_info") or {} - investor.aum_amount = aum_info.get("aumAmount") - investor.aum_as_of_date = aum_info.get("asOfDate") - investor.aum_source_url = aum_info.get("sourceUrl") - - # Fund information - investor.funds_info = investor_data.get("funds_info", []) - - # Raw data - investor.crunchbase_urls = investor_data.get("crunchbase_urls") - investor.crunchbase_extract = investor_data.get("crunchbase_extract") - investor.linkedin_profile = investor_data.get("linkedin_profile") - investor.source_truth_profile = investor_data.get( - "source_truth_profile" - ) - - if not existing: - session.add(investor) - - session.flush() # Get the ID - return investor.id - - except Exception as e: - logger.error(f"Failed to save to SQL: {e}") - raise - - def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]): - """Save investor description and focus to ChromaDB""" - try: - # Prepare text for embedding - description_text = investor_data.get( - "enhanced_description" - ) or investor_data.get("investor_description", "") - focus_areas = investor_data.get("standardized_focus") or investor_data.get( - "investment_thesis_focus", [] - ) - - if isinstance(focus_areas, list): - focus_text = " ".join(focus_areas) - else: - focus_text = str(focus_areas) - - # Combine description and focus for embedding - combined_text = f"{description_text} {focus_text}".strip() - - if not combined_text: - logger.warning(f"No text to embed for investor {investor_data['name']}") - return - - # Create metadata - metadata = { - "investor_id": investor_id, - "name": investor_data["name"], - "website": investor_data.get("website") or "", - "headquarters": investor_data.get("standardized_headquarters") - or investor_data.get("headquarters") - or "", - "focus_areas_count": len(focus_areas) - if isinstance(focus_areas, list) - else 0, - } - - # Add to ChromaDB - self.collection.add( - documents=[combined_text], - metadatas=[metadata], - ids=[f"investor_{investor_id}"], - ) - - logger.info(f"Added investor {investor_data['name']} to vector database") - - except Exception as e: - logger.error(f"Failed to save to vector DB: {e}") - - def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None): - """Process the entire CSV file""" - logger.info(f"Starting to process CSV file: {csv_file_path}") - - # Read CSV - df = pd.read_csv(csv_file_path) - logger.info(f"Loaded {len(df)} rows from CSV") - - if limit: - df = df.head(limit) - logger.info(f"Processing limited to {limit} rows") - - processed_count = 0 - error_count = 0 - - for index, row in df.iterrows(): - try: - logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}") - - # Create CSVRow object - csv_row = CSVRow( - name=row["Name"], - website=row.get("Website"), - investment_firm_profile=row.get("Investment Firm Profile"), - crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"), - crunchbase_firm_extract=row.get("Crunchbase Firm Extract"), - linkedin_investment_profile=row.get("LinkedIn Investment Profile"), - source_of_truth_profile=row.get("Source of Truth Profile"), - ) - - # Extract structured data - structured_data = self.extract_structured_data(csv_row) - - # Enhance with LLM if enabled - enhanced_data = self.enhance_with_llm(structured_data) - - # Save to SQL database - investor_id = self.save_to_sql(enhanced_data) - - # Save to vector database - self.save_to_vector_db(investor_id, enhanced_data) - - processed_count += 1 - - # Progress update every 10 rows - if (index + 1) % 10 == 0: - logger.info( - f"Progress: {processed_count} processed, {error_count} errors" - ) - - except Exception as e: - error_count += 1 - logger.error( - f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}" - ) - continue - - logger.info( - f"Processing complete! Processed: {processed_count}, Errors: {error_count}" - ) - return processed_count, error_count - - def search_investors(self, query: str, limit: int = 10): - """Search investors using vector similarity""" - try: - results = self.collection.query(query_texts=[query], n_results=limit) - - return results - - except Exception as e: - logger.error(f"Search failed: {e}") - return None - - -def main(): - """Main function with command line interface""" - parser = argparse.ArgumentParser(description="LLM-powered Investor Parser") - parser.add_argument("--file", type=str, help="Path to CSV file to process") - parser.add_argument("--limit", type=int, help="Limit number of rows to process") - parser.add_argument( - "--use-llm", - action="store_true", - help="Enable LLM enhancement (requires OpenAI API key)", - ) - parser.add_argument("--search", type=str, help="Search query for vector database") - parser.add_argument( - "--search-limit", - type=int, - default=10, - help="Number of search results to return", - ) - - args = parser.parse_args() - - # Initialize parser - investor_parser = InvestorParser(use_llm=args.use_llm) - - if args.search: - # Perform search - logger.info(f"Searching for: {args.search}") - results = investor_parser.search_investors(args.search, args.search_limit) - - if results and results["documents"][0]: - print(f"\nFound {len(results['documents'][0])} similar investors:") - for i, (doc, metadata) in enumerate( - zip(results["documents"][0], results["metadatas"][0]) - ): - print(f"{i + 1}. {metadata['name']}") - print(f" Website: {metadata.get('website', 'N/A')}") - print(f" HQ: {metadata.get('headquarters', 'N/A')}") - print(f" Focus areas: {metadata.get('focus_areas_count', 0)}") - print(f" Similarity score: {results['distances'][0][i]:.3f}") - print() - else: - print("No results found.") - - elif args.file: - # Process CSV file - if not os.path.exists(args.file): - logger.error(f"File not found: {args.file}") - return - - processed, errors = investor_parser.process_csv_file(args.file, args.limit) - - print("\nProcessing complete!") - print(f"Successfully processed: {processed} investors") - print(f"Errors encountered: {errors}") - - # Show some search examples - print("\nTrying some example searches...") - for query in ["bioeconomy", "venture capital", "sustainability"]: - results = investor_parser.search_investors(query, 3) - if results and results["documents"][0]: - print(f"\nTop matches for '{query}':") - for i, metadata in enumerate(results["metadatas"][0][:3]): - print(f" {i + 1}. {metadata['name']}") - - else: - parser.print_help() - - -if __name__ == "__main__": - main() diff --git a/app/services/langgraph_agent.py b/app/services/langgraph_agent.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/llm_parser.py b/app/services/llm_parser.py index e22238b..4465251 100644 --- a/app/services/llm_parser.py +++ b/app/services/llm_parser.py @@ -1,28 +1,368 @@ -import asyncio -import csv +import json +import logging +import os +from typing import Any, Dict, Optional -from openai import AsyncOpenAI -from pydantic import BaseModel +import chromadb +import pandas as pd +from dotenv import load_dotenv +from openai import OpenAI + +from db import get_session, init_database +from schema import CSVRow, Investor + +# Load environment variables +load_dotenv() + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) -class RowSchema(BaseModel): - section: str - explanation: str +class LLMInvestorParser: + def __init__(self): + # Initialize OpenAI client + self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -client = AsyncOpenAI() + # Initialize ChromaDB + self.chroma_client = chromadb.PersistentClient(path="./chroma_db") + self.collection = self.chroma_client.get_or_create_collection( + name="investor_descriptions", + metadata={ + "description": "Investor descriptions and investment thesis focus" + }, + ) -async def process_row(row): - resp = await client.chat.completions.create( - model="gpt-4o-mini", - messages=[{"role": "user", "content": f"Extract relevant section:\n{row}"}], - response_format={"type": "json_object"} # ensures JSON output - ) - return RowSchema.model_validate_json(resp.choices[0].message.content) + # Initialize database + init_database() -async def main(): - with open("data.csv") as f: - reader = csv.DictReader(f) - tasks = [process_row(row) for row in reader] - return await asyncio.gather(*tasks) + def parse_json_field(self, json_str: str) -> Dict[str, Any]: + """Safely parse JSON string with LLM assistance if needed""" + if not json_str or json_str.strip() == "": + return {} -results = asyncio.run(main()) \ No newline at end of file + try: + # Try direct JSON parsing first + return json.loads(json_str) + except json.JSONDecodeError: + # If direct parsing fails, use LLM to clean and parse + logger.info("Direct JSON parsing failed, using LLM to clean JSON") + return self._llm_clean_json(json_str) + + def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]: + """Use LLM to clean and parse malformed JSON""" + try: + prompt = f""" + The following text appears to be malformed JSON. Please clean it up and return valid JSON. + If it's not possible to create valid JSON, return an empty object {{}}. + + Original text: + {malformed_json[:2000]} # Limit length for API + + Return only the cleaned JSON, no explanations: + """ + + response = self.openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0, + ) + + cleaned_json = response.choices[0].message.content.strip() + return json.loads(cleaned_json) + + except Exception as e: + logger.error(f"LLM JSON cleaning failed: {e}") + return {} + + def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]: + """Extract and structure data from CSV row using LLM""" + # Parse the investment firm profile + profile_data = {} + if csv_row.investment_firm_profile: + profile_data = self.parse_json_field(csv_row.investment_firm_profile) + + # Create structured output + structured_data = { + "name": csv_row.name, + "website": csv_row.website or profile_data.get("websiteURL"), + "investor_description": profile_data.get("investorDescription", ""), + "investment_thesis_focus": profile_data.get("investmentThesisFocus", []), + "headquarters": profile_data.get("headquarters", ""), + "aum_info": profile_data.get("overallAssetsUnderManagement", {}), + "funds_info": profile_data.get("funds", []), + "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "", + "crunchbase_extract": csv_row.crunchbase_firm_extract or "", + "linkedin_profile": csv_row.linkedin_investment_profile or "", + "source_truth_profile": csv_row.source_of_truth_profile or "", + } + + return structured_data + + def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]: + """Use LLM to enhance and standardize investor data""" + try: + # Combine all available text for context + context_text = " ".join( + [ + investor_data.get("investor_description", ""), + investor_data.get("crunchbase_extract", ""), + investor_data.get("linkedin_profile", ""), + investor_data.get("source_truth_profile", ""), + ] + ) + + if not context_text.strip(): + return investor_data + + prompt = f""" + Based on the following information about an investor, please extract and standardize: + 1. A concise investor description (2-3 sentences) + 2. Investment thesis focus areas (list of specific focus areas) + 3. Headquarters location (city, country format) + + Investor: {investor_data["name"]} + Context: {context_text[:3000]} # Limit for API + + Return in JSON format: + {{ + "enhanced_description": "concise description here", + "standardized_focus": ["focus area 1", "focus area 2", ...], + "standardized_headquarters": "City, Country" + }} + """ + + response = self.openai_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": prompt}], + temperature=0.3, + ) + + enhanced_data = json.loads(response.choices[0].message.content) + + # Update investor data with enhanced information + if enhanced_data.get("enhanced_description"): + investor_data["enhanced_description"] = enhanced_data[ + "enhanced_description" + ] + + if enhanced_data.get("standardized_focus"): + investor_data["standardized_focus"] = enhanced_data[ + "standardized_focus" + ] + + if enhanced_data.get("standardized_headquarters"): + investor_data["standardized_headquarters"] = enhanced_data[ + "standardized_headquarters" + ] + + return investor_data + + except Exception as e: + logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}") + return investor_data + + def save_to_sql(self, investor_data: Dict[str, Any]) -> int: + """Save investor data to SQL database""" + try: + with get_session() as session: + # Check if investor already exists + existing = ( + session.query(Investor) + .filter_by(name=investor_data["name"]) + .first() + ) + + if existing: + logger.info(f"Updating existing investor: {investor_data['name']}") + investor = existing + else: + logger.info(f"Creating new investor: {investor_data['name']}") + investor = Investor() + + # Map data to investor object + investor.name = investor_data["name"] + investor.website = investor_data.get("website") + investor.investor_description = investor_data.get( + "enhanced_description" + ) or investor_data.get("investor_description") + investor.investment_thesis_focus = investor_data.get( + "standardized_focus" + ) or investor_data.get("investment_thesis_focus") + investor.headquarters = investor_data.get( + "standardized_headquarters" + ) or investor_data.get("headquarters") + + # AUM information + aum_info = investor_data.get("aum_info", {}) + investor.aum_amount = aum_info.get("aumAmount") + investor.aum_as_of_date = aum_info.get("asOfDate") + investor.aum_source_url = aum_info.get("sourceUrl") + + # Fund information + investor.funds_info = investor_data.get("funds_info", []) + + # Raw data + investor.crunchbase_urls = investor_data.get("crunchbase_urls") + investor.crunchbase_extract = investor_data.get("crunchbase_extract") + investor.linkedin_profile = investor_data.get("linkedin_profile") + investor.source_truth_profile = investor_data.get( + "source_truth_profile" + ) + + if not existing: + session.add(investor) + + session.flush() # Get the ID + return investor.id + + except Exception as e: + logger.error(f"Failed to save to SQL: {e}") + raise + + def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]): + """Save investor description and focus to ChromaDB""" + try: + # Prepare text for embedding + description_text = investor_data.get( + "enhanced_description" + ) or investor_data.get("investor_description", "") + focus_areas = investor_data.get("standardized_focus") or investor_data.get( + "investment_thesis_focus", [] + ) + + if isinstance(focus_areas, list): + focus_text = " ".join(focus_areas) + else: + focus_text = str(focus_areas) + + # Combine description and focus for embedding + combined_text = f"{description_text} {focus_text}".strip() + + if not combined_text: + logger.warning(f"No text to embed for investor {investor_data['name']}") + return + + # Create metadata + metadata = { + "investor_id": investor_id, + "name": investor_data["name"], + "website": investor_data.get("website", ""), + "headquarters": investor_data.get("standardized_headquarters") + or investor_data.get("headquarters", ""), + "focus_areas_count": len(focus_areas) + if isinstance(focus_areas, list) + else 0, + } + + # Add to ChromaDB + self.collection.add( + documents=[combined_text], + metadatas=[metadata], + ids=[f"investor_{investor_id}"], + ) + + logger.info(f"Added investor {investor_data['name']} to vector database") + + except Exception as e: + logger.error(f"Failed to save to vector DB: {e}") + + def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None): + """Process the entire CSV file""" + logger.info(f"Starting to process CSV file: {csv_file_path}") + + # Read CSV + df = pd.read_csv(csv_file_path) + logger.info(f"Loaded {len(df)} rows from CSV") + + if limit: + df = df.head(limit) + logger.info(f"Processing limited to {limit} rows") + + processed_count = 0 + error_count = 0 + + for index, row in df.iterrows(): + try: + logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}") + + # Create CSVRow object + csv_row = CSVRow( + name=row["Name"], + website=row.get("Website"), + investment_firm_profile=row.get("Investment Firm Profile"), + crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"), + crunchbase_firm_extract=row.get("Crunchbase Firm Extract"), + linkedin_investment_profile=row.get("LinkedIn Investment Profile"), + source_of_truth_profile=row.get("Source of Truth Profile"), + ) + + # Extract structured data + structured_data = self.extract_structured_data(csv_row) + + # Enhance with LLM + enhanced_data = self.enhance_with_llm(structured_data) + + # Save to SQL database + investor_id = self.save_to_sql(enhanced_data) + + # Save to vector database + self.save_to_vector_db(investor_id, enhanced_data) + + processed_count += 1 + + # Progress update every 10 rows + if (index + 1) % 10 == 0: + logger.info( + f"Processed {processed_count} rows successfully, {error_count} errors" + ) + + except Exception as e: + error_count += 1 + logger.error( + f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}" + ) + continue + + logger.info( + f"Processing complete! Processed: {processed_count}, Errors: {error_count}" + ) + return processed_count, error_count + + def search_investors(self, query: str, limit: int = 5): + """Search investors using vector similarity""" + try: + results = self.collection.query(query_texts=[query], n_results=limit) + + return results + + except Exception as e: + logger.error(f"Search failed: {e}") + return None + + +def main(): + """Main function to run the parser""" + parser = LLMInvestorParser() + + # Process the CSV file + csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv" + + # Start with a small sample for testing + processed, errors = parser.process_csv_file(csv_file, limit=5) + + print("\nProcessing complete!") + print(f"Successfully processed: {processed} investors") + print(f"Errors encountered: {errors}") + + # Test search functionality + print("\nTesting search functionality...") + results = parser.search_investors("bioeconomy circular economy") + if results: + print(f"Found {len(results['documents'][0])} similar investors") + for i, doc in enumerate(results["documents"][0]): + print(f" {i + 1}. {results['metadatas'][0][i]['name']}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/app/services/openrouter.py b/app/services/openrouter.py new file mode 100644 index 0000000..6fa8a01 --- /dev/null +++ b/app/services/openrouter.py @@ -0,0 +1,178 @@ +import asyncio +from typing import List, Optional + +import chromadb +import pandas as pd +from db.tables import InvestorTable +from langchain_core.prompts import PromptTemplate +from langchain_openai import ChatOpenAI +from pydantic_schemas import Investor, InvestorList +from settings import settings + +# Add these imports for your databases +# from sqlalchemy.ext.asyncio import AsyncSession +# from your_vector_db import VectorDBClient + + +class InvestorProcessor: + def __init__( + self, + sql_session: Optional[object] = None, + vector_db_client: Optional[object] = None, + ): + self.template = """You are an expert data extraction assistant. Extract investor information from the provided CSV data and return it as a list of structured records. + +Given the following CSV data rows: +{question} + +For each row, extract and structure the following fields: +- name: The investor's full name +- aum: Assets under management (as integer, use 0 if not available) +- check_size: Investment check size (as string) +- sector_focus: Sector focus (as string) +- stage_focus: Investment stage focus (as string) +- region: Geographic region (as string) +- investment_thesis: Investment thesis (as string) +- investor_description: Description of the investor (as string) + +Important: +- If a field is not available in the data, use appropriate default values (empty string for text fields, 0 for numbers) +- Ensure all text fields are properly escaped and contain no control characters +- Return clean, valid JSON only + +Return the data as a structured list of investors.""" + + self.prompt = PromptTemplate( + template=self.template, input_variables=["question"] + ) + + self.llm = ChatOpenAI( + api_key=settings.OPENROUTER_API_KEY, + base_url="https://openrouter.ai/api/v1", + model="openai/gpt-oss-120b:fre", + temperature=0, + ) + + self.structured_llm = self.llm.with_structured_output(InvestorList) + self.sql_session = sql_session + self.vector_db_client = vector_db_client + + self.vector_db_client = chromadb.PersistentClient(path="./chroma_db") + self.collection = self.vector_db_client.get_or_create_collection( + name="investor_descriptions", + metadata={ + "description": "Investor descriptions and investment thesis focus" + }, + ) + + async def _process_batch(self, batch: pd.DataFrame, batch_idx: int) -> List: + """Process a single batch of data""" + # Convert batch to string representation - clean the data + batch_str = "" + for idx, row in batch.iterrows(): + # Clean values to remove control characters + cleaned_row = {} + for key, value in row.items(): + if pd.notna(value): + # Convert to string and clean control characters + clean_value = ( + str(value) + .replace("\n", " ") + .replace("\r", " ") + .replace("\t", " ") + ) + # Remove other control characters + clean_value = "".join( + char + for char in clean_value + if ord(char) >= 32 or char in ["\n", "\r", "\t"] + ) + cleaned_row[key] = clean_value + + row_str = ", ".join( + [f"{key}: {value}" for key, value in cleaned_row.items()] + ) + batch_str += f"Row {idx + 1}: {row_str}\n" + + try: + print(f"Processing batch {batch_idx + 1}...") + batch_results = await self.structured_llm.ainvoke(batch_str) + return batch_results.investor_list + except Exception as e: + print(f"Error processing batch {batch_idx + 1}: {e}") + return [] + + async def _save_to_sql(self, investors: List[Investor]) -> None: + """Save investors to SQL database""" + if not self.sql_session: + return + + # Implement SQL saving logic here + for investor in investors: + db_investor = InvestorTable( + name=investor.name, + aum=investor.aum, + check_size=investor.check_size, + sector_focus=investor.sector_focus, + stage_focus=investor.stage_focus, + region=investor.region, + ) + self.sql_session.add(db_investor) + self.sql_session.commit() + + async def _save_to_vector_db(self, investors: List[Investor]) -> None: + """Save investors to vector database""" + if not self.vector_db_client: + return + + documents = [] + metadatas = [] + ids = [] + + for i, investor in enumerate(investors): + doc_text = f"{investor.investor_description}\nInvestment Thesis: {investor.investment_thesis}" + documents.append(doc_text) + metadatas.append({"name": investor.name}) + ids.append(f"investor_{i}_{investor.name.replace(' ', '_')}") + + if documents: + # Use add method with proper parameters + self.collection.add(documents=documents, metadatas=metadatas, ids=ids) + + async def process_csv( + self, df: pd.DataFrame, batch_size: int = 10, max_concurrent: int = 10 + ) -> List: + """Process CSV data in parallel batches and save to databases""" + results = [] + + # Create batches + batches = [] + for i in range(0, len(df), batch_size): + batch = df.iloc[i : i + batch_size] + batches.append((batch, i // batch_size)) + + # Process batches with concurrency control + semaphore = asyncio.Semaphore(max_concurrent) + + async def process_with_semaphore(batch_data): + batch, batch_idx = batch_data + async with semaphore: + return await self._process_batch(batch, batch_idx) + + # Execute all batches concurrently + batch_results = await asyncio.gather( + *[process_with_semaphore(batch_data) for batch_data in batches], + return_exceptions=True, + ) + + # Collect results, filtering out exceptions + for batch_result in batch_results: + if not isinstance(batch_result, Exception): + results.extend(batch_result) + + # Save to databases + if results: + await self._save_to_sql(results) + await self._save_to_vector_db(results) + + return results diff --git a/app/services/querying.py b/app/services/querying.py new file mode 100644 index 0000000..ea80784 --- /dev/null +++ b/app/services/querying.py @@ -0,0 +1,61 @@ +from typing import Optional + +import chromadb +from langchain_openai import ChatOpenAI +from pydantic_schemas import Investor, InvestorList +from settings import settings + +# Add these imports for your databases +# from sqlalchemy.ext.asyncio import AsyncSession +# from your_vector_db import VectorDBClient + + +class QueryProcessor: + def __init__( + self, + sql_session: Optional[object] = None, + vector_db_client: Optional[object] = None, + ): + self.llm = ChatOpenAI( + api_key=settings.OPENROUTER_API_KEY, + base_url="https://openrouter.ai/api/v1", + model="openai/gpt-oss-120b:free", + temperature=0, + ) + + self.structured_llm = self.llm.with_structured_output(InvestorList) + self.sql_session = sql_session + self.vector_db_client = vector_db_client + + self.vector_db_client = chromadb.PersistentClient(path="./chroma_db") + self.collection = self.vector_db_client.get_or_create_collection( + name="investor_descriptions", + metadata={ + "description": "Investor descriptions and investment thesis focus" + }, + ) + + def query_sql_database(self, query: str) -> Optional[InvestorList]: + """Query the SQL database for investor information.""" + if not self.sql_session: + return None + + # Implement SQL querying logic here + result = self.sql_session.execute(query) + investors = result.scalars().all() + return InvestorList(investors=investors) + + def query_vector_database(self, query: str) -> Optional[InvestorList]: + """Query the vector database for investor information.""" + if not self.vector_db_client: + return None + + # Implement vector database querying logic here + results = self.vector_db_client.query(collection=self.collection, query=query) + investors = [Investor(**doc.metadata) for doc in results.documents] + return InvestorList(investors=investors) + + def process_query(self, question: str) -> InvestorList: + """Process a query using the LLM and return structured investor data.""" + response = self.structured_llm.predict(question=question) + return response diff --git a/app/settings.py b/app/settings.py index 2f9dd5b..a9376fe 100644 --- a/app/settings.py +++ b/app/settings.py @@ -1,10 +1,11 @@ from pydantic_settings import BaseSettings + class Settings(BaseSettings): - api_key: str - db_url: str + OPENROUTER_API_KEY: str class Config: env_file = ".env" -settings = Settings() \ No newline at end of file + +settings = Settings()