From 24823432b3931543dd4e703c758956c84b6f4110 Mon Sep 17 00:00:00 2001 From: kowshik Date: Thu, 12 Sep 2024 00:01:03 +0000 Subject: [PATCH] run testes on assessments predictions pipeline --- config.py | 0 .../company_id/output.csv | 6 ++ .../assessment_prediction/testid/output.csv | 6 ++ data/raw/dummy_assessment_data.csv | 6 ++ .../assessment_prediction/testid/output.csv | 6 ++ .../testid/testid_latest_data.csv | 2 + .../testid/testid_model.pkl | Bin 0 -> 129833 bytes requirements.txt | 4 +- scripts/run_assessment_prediction_trainer.py | 45 +++++++++ src/pipeline/data_preprocessor.py | 68 ++++++++++++++ src/pipeline/model_trainer.py | 88 ++++++++++++++++++ test.py | 21 ++--- tests/test_pipeline/__init__.py | 0 tests/test_pipeline/test_data_preprocessor.py | 18 ++++ 14 files changed, 254 insertions(+), 16 deletions(-) create mode 100644 config.py create mode 100644 data/processed/assessment_prediction/company_id/output.csv create mode 100644 data/processed/assessment_prediction/testid/output.csv create mode 100644 data/raw/dummy_assessment_data.csv create mode 100644 notebooks/data/processed/assessment_prediction/testid/output.csv create mode 100644 notebooks/models/assessment_prediction/testid/testid_latest_data.csv create mode 100644 notebooks/models/assessment_prediction/testid/testid_model.pkl create mode 100644 scripts/run_assessment_prediction_trainer.py create mode 100644 tests/test_pipeline/__init__.py create mode 100644 tests/test_pipeline/test_data_preprocessor.py diff --git a/config.py b/config.py new file mode 100644 index 0000000..e69de29 diff --git a/data/processed/assessment_prediction/company_id/output.csv b/data/processed/assessment_prediction/company_id/output.csv new file mode 100644 index 0000000..36b15fa --- /dev/null +++ b/data/processed/assessment_prediction/company_id/output.csv @@ -0,0 +1,6 @@ +open_items,red_flags,num_employees,duration,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,percentage_change_open_items +10,2,30,1,0,0,1,0.0,0.0,0.0,10.0,0.0,0.0,0.0 +12,1,25,1,1,0,0,10.0,0.0,0.0,10.0,12.0,0.0,19.999999999999996 +11,3,28,1,0,1,0,0.0,12.0,0.0,10.0,12.0,11.0,-8.333333333333337 +9,1,30,1,0,0,1,0.0,0.0,11.0,9.0,12.0,11.0,-18.181818181818176 +13,4,27,1,1,0,0,9.0,0.0,0.0,9.0,13.0,11.0,44.44444444444444 diff --git a/data/processed/assessment_prediction/testid/output.csv b/data/processed/assessment_prediction/testid/output.csv new file mode 100644 index 0000000..fe2c484 --- /dev/null +++ b/data/processed/assessment_prediction/testid/output.csv @@ -0,0 +1,6 @@ +start_date,end_date,open_items,red_flags,num_employees,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,time_since_last_event,percentage_change_open_items +2023-01-01,2023-01-02,10,2,30,0,0,1,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0 +2023-01-08,2023-01-09,12,1,25,1,0,0,10.0,0.0,0.0,10.0,12.0,0.0,7.0,19.999999999999996 +2023-01-15,2023-01-16,11,3,28,0,1,0,0.0,12.0,0.0,10.0,12.0,11.0,7.0,-8.333333333333337 +2023-01-22,2023-01-23,9,1,30,0,0,1,0.0,0.0,11.0,9.0,12.0,11.0,7.0,-18.181818181818176 +2023-01-29,2023-01-30,13,4,27,1,0,0,9.0,0.0,0.0,9.0,13.0,11.0,7.0,44.44444444444444 diff --git a/data/raw/dummy_assessment_data.csv b/data/raw/dummy_assessment_data.csv new file mode 100644 index 0000000..28b8771 --- /dev/null +++ b/data/raw/dummy_assessment_data.csv @@ -0,0 +1,6 @@ +start_date,end_date,open_items,red_flags,num_employees,assessment_type +2023-01-01,2023-01-02,10,2,30,weekly +2023-01-08,2023-01-09,12,1,25,biweekly +2023-01-15,2023-01-16,11,3,28,quarterly +2023-01-22,2023-01-23,9,1,30,weekly +2023-01-29,2023-01-30,13,4,27,biweekly diff --git a/notebooks/data/processed/assessment_prediction/testid/output.csv b/notebooks/data/processed/assessment_prediction/testid/output.csv new file mode 100644 index 0000000..36b15fa --- /dev/null +++ b/notebooks/data/processed/assessment_prediction/testid/output.csv @@ -0,0 +1,6 @@ +open_items,red_flags,num_employees,duration,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,percentage_change_open_items +10,2,30,1,0,0,1,0.0,0.0,0.0,10.0,0.0,0.0,0.0 +12,1,25,1,1,0,0,10.0,0.0,0.0,10.0,12.0,0.0,19.999999999999996 +11,3,28,1,0,1,0,0.0,12.0,0.0,10.0,12.0,11.0,-8.333333333333337 +9,1,30,1,0,0,1,0.0,0.0,11.0,9.0,12.0,11.0,-18.181818181818176 +13,4,27,1,1,0,0,9.0,0.0,0.0,9.0,13.0,11.0,44.44444444444444 diff --git a/notebooks/models/assessment_prediction/testid/testid_latest_data.csv b/notebooks/models/assessment_prediction/testid/testid_latest_data.csv new file mode 100644 index 0000000..8068b2a --- /dev/null +++ b/notebooks/models/assessment_prediction/testid/testid_latest_data.csv @@ -0,0 +1,2 @@ +open_items,red_flags,num_employees,duration,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,percentage_change_open_items +13,4,27,1,1,0,0,9.0,0.0,0.0,9.0,13.0,11.0,44.44444444444444 diff --git a/notebooks/models/assessment_prediction/testid/testid_model.pkl b/notebooks/models/assessment_prediction/testid/testid_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..93129807abb023e81ad3441ae5f7f39ac4c3ab8b GIT binary patch literal 129833 zcmeHwdyE^`eeQ7IqIcz$w31d2Kf1D{UB%s6maHp3lW1VvFbEow(@-9Lf=1ze#j+NO0L@NJN$K^+)w65m|gEvou~ zo7NBAGjrzSkn`A*Ge7QdDDDD09L}7XGiSc@{hsH{QqOX+N7DXE8}cXSbZ~0 zi9(UzGe7-Wbz40qp2^N=zYONIUWn%liGs|J*r=X$F1=9Xx3I{YmCDJ9yj-q2rPdAl zg?WWFsnkL|F~7hPRR;~Fvzcrmo0&_+Q@pRIOT%%!FXO)@=hUt%mwNY3-ZXh*xtJ^s zX5y7cmM=$VO2d`S_(CF=NaxGrY?zI(jcgO!%tqN3wvBCNSFx-4V_cKWW()a3F0oKP zU0RdPCgb@Of28FzrM^u3(QK06HrpEAQ|jjzPGuZLPW1_GxB!xnGLTuraoSU3Z%8Ji{-p-ZtAE zmDsp?k8HyGeU;YfSc+#7X%0{hm2znV@8tQp(6*=M#mw%wDDU8Kv*2(tk=BII64}kB0cY zDq;PE+Fgc~D`H*z_y-OugZ%!Kp`*%Leuvt*zMx!bzDs2|dFGh1Zag-`>+sgcWzHbk zMOn@(>th4_x0zy&UpbpmHpE8xrbc9W?Z(wUjVI?GljRfhi^@oBi_LD8!Z>B|{TpMO zZ1(GwH^p}Io!JGMGe|*B=i}zfh~wh(iP`vGWpiv|<)hIk{rJ(?o|TWUv{J9IMQdwS zbM(un6Y+h@R;?eZc2=&Y?cS#KL)C8Mkvep7flhS4(qqz!T-m^YEm(R~-X5+Kj zcp=N>^6^5spxp>F_YQlccF3!J6P2DRvpH6X4)F3(UJmo}gj(}>RlnNS9OXyOvnQh0 z@-oe9j`K3Zw{PHORy{*;Z%vofkg9g=gU^(U(FkX*;p(T<&7TmSk4j(apGrT*-;Rj* z{>D_d@gx5>tp6@a`eUvXpXlH8!!PKigzJdyi&OX==hbycDU2OjxhW!jOWFSS%6F&G zKCycsg2O5o*1yzueaT~eqy5V_=q)Mr{Z?YW$fc4Qyu8;r|nL}iHdi3W?>*XivU zi@bPF{lhlOU6JLoejTo<2z|vJg?!`C+tGR`gngQm!LPFiSD=L7tlUf3cnWyV%LnxK zR*&}(`+T&UB}_fkT^Zi9l`{_Dj5r^jf4_cOQE+@{fB#^@2OU1q4wUub%fRQdDmK+W zcFKI)4!{_~rLNaUbM5j6lR?BM@PShF z7x39rbn>i*eE3Fvd}qi*clC|-JM*O*e2!OeD*AtyzD>uUkG`L} z)i-<+$kQuC$EcrY&z&2-AU6NE3143M(|Eqk!$-FZ{sJDZBa^3xfAH1A9XwpRByc3@ zc#=%||K)>+zkKNq?*BC0jZP+eQ}k^Q`W56>m;N9{(YXAA9ILJ0kza-(Xru2-yCb5% z41G#$gRl0G@zu!n-x>7{pTzR(*~zgBqMzE`9)!L<77_PLC%=yW@Z#GG(09ZhuXr%VH}s!A{gP(?<&x$*xwT(T9Ve9Vs8RH4^0oSed=ka_S?XPw9b!Ys!u?Hb9pnpSN(Bbndx8HSB z2cPxFuRHkU1F+(+Xg)r_boT5n-|$JG&)DnR7ftq@A-}xxr`P(<{MCp)gLnk+L8sp? z_x)P0Z+P$^PwnLy;#2kK2jVMH>92q(qi`M(wHtMP}1f9zx-V2 zi@x!v2Ys(cXB?Db6aK4?JrdIUM=v7Sq|x_~&tmoai|f$odm~?c&p*EMfKS*Xw)kAd zc)GCi%4ZUMu426M>|GnK4kCX7pH=qA#W$Ybb3u$>Yksx(-R*i%QUvwQ@pNp9 z{owC*$0zbT#`JyxpCTSX7eD>ZKmO!?-|$JG@7d#{-TlvIe9+_c+jt)S3!;mTtI%*um9sbwe`(-GUS(6|A9aCjrNo1 zSC)9hu88QrF}H+=HjM||Te5BeVPX`esxp=CZec+H)@W#~(=KQS(_EzS>pv)%Ezdh;3o2Sn9RR2ls|gBcB#{#C0wD-rqK3y<+3~UdI=SJ+cJ;+@r6rJ{A%Ei{I(u z5x2~I%|HK%2mS;;?BfAe`HuJza1EUG0k~S(`t$Qz1D}xRRzI)kPC9&ka__O@L6q++ zEwpGLl8y%*pVavk`j^EX~P{O%P$smy?tbr?_ylj$8&4PN9*ecH28#k zAJh5Lic_ljM6Z4PN+9ib$ammW;7^y&Hu*44{22ytwG!*aw66W_jz1+GN0LN?Pw*sg z3jQ3^uS3V5FNg103^IReu)+uY;WqfhAzq&^^Z|^I%hUe}d}6*Ga5@$d{Y>ZoeD$ZV z1adt$8+_Ev9|4X;e>ty*88rFqb$rqAC;XpQxi0#fjz8D_{CE9(-;f9YQmtGUz2e9G zyV`NU^^2y=zccKcmcS=p4-+`7oqso`|4x@bc3%4Hf^YJjAl_|1f4SZB3Bg|mPN?jW z^B?Zr9ArGyr*E}eK4J6ptCx<1?)4211o8`dmf%y^r=9p6>~YaA)b=Ut(dNhR&ObXH z==mD<{=L=9f6$-5ULc>bb^R~2`akSb*a48IboS|u-@j^K5cw0l*wo57{un`g#6CXU z?(1vq(D7j4gUTNH+0f3ZYKcIoyA@Q9M0uWr2Z^fuq{Nx)b3 z`GB4G$GKfk^_j1Y`?ua20Uqq-mpMMsZt+~iBMhFPl3yc_KmVy9^MkEFO&V_V_e(-f zK+mPqZ|`~MGx;Fnq00VRVSRyi;|J)szyn>p@=p#vKkOSG$o)n7`X@I2c1i#9>JNnd z-lLz-qQA7bzn;XOxr+0(*F7W$Iv&&q;Ati1Q`PTpx8SSA*MIUlzDV%N{q0WcMv2Ek z=g)k4;_X}z`4jwQ9}lw5H+1r6TJWiLuvuSkop_Erd>ZX1!RHd>xmE8M_@s*m#m4q4 zzTwlu`!&Fuwf5jdMC+fWPz3lD{pCDgrb=Jb@W^1~hsF4V{(8~$K2bwHdgV`l>KpBM z#$zMzQv*+#aS7g{!{>iq``)x~@}0nbx6g;S>YoCCZsoJ_e9iVd#)&vT;_|gxKCGFq=;Y~d9NX068((>_m%GG3to=m(q>a6d1HI~7W_fCs=gqdaVP`{64dceN z67!dWD9_#4%dmf77mr0mcVN8WcRK&){%~lEZ{vl)UKahMe`38LiP~N^uGyn8w z3qi(bRs74yKex%xfj}8CH zG4b3h>u~JSx3QltzWl8>4_(uKe7?r=ejKOqV&QWQxb22lL!LO}wGloC9(U-t9BUdr zUpzk^==bA*7d2Q(;Pd+NY36fro(?`&oS$2X*EAoWAHDTAU-XSX3H*)FI|QzfPeDCD z*U%^3$y;K7<5Khd&;57a_fQ89mo5oBNf-3eF+SVwczEg0zc=6;9(=S%AopN53mm*L z=M_eBInOA-^>O1RqUhEMkISM_H&K?;%o4DS$@j~GL1nvdiq3>DO`vu<3 z{v}Jirz!p=1269Q*RUtt)i3IZ8v2?$e9+;`-Svs_$=9nA^S^fMcrvL7>l6>EA}YOzNszURdN! zC!SkJJa?xqFQ9ifPoJ5+^7kfu;|Bu!)E1u&#LIME(ThV7C;Jq~ftOI$$4`bn0lOA< zf+gQUyHLCNca8gZ=4UVVDfFr4kC&&9#qaQKym;Vq=vDUdgw=cB2kl-{?ChSw-68Z-4 zN#{TLxci^yeZwb#eG0iPaH;$Eq>Cp1hhdLEPe56pzTnmWVbpi#uSW9o$MtbYkB5`L zyrb7QJTy(7TGz7`cyT*l*wCNM@e%N*I6s%CO<51uxIS=2U*GF~^}%(%@n;*zcMIN|BHs;t5po@6ee&JV51K3A!RvH< ze$yWu^>6;22Yf>Bv*+_pz5((pg#0JQ=PJJ6?Z-QN{L3FB&_6BlAMJrq=a+iTH>d|6 z4cN<-^^Cw*Jih;~K%!JkvTNA2#iT3>HlW8wd)biwZJ4cN>)8f2GR-zdyVxckLfw2= zuPw3BX|`pWZC!bhE$s^Im!dOljO}38on|}FuwA@XbPwAdmDuU*TrRN~ zj_2f5Hj^*pim5_ak;GBq%#7Hpgrd?(r7IOrWoP76J}hU{FJYxSrjG5T(i2l<<9wnp zm)WcIKBM#p!iUWd0nE~gjfvx~BvSJuZ=9eAdg<5$jRlnt>_ zzM079<$OLZX9{ukYVqXUW3qf=eo+~TZL!&{6cf3Eoa6g9#x~jP*DG&|?dChP3vwnt zSCG^BxcM^TxcGcxHojNc9Gh7AXf#Scel)gc<>M=@)GKV!+FI2d{qpHVe4nya>xZhH zm8)sHw`u)Qwc9wkwtstUJ8#T_oJ;XOOw7vh6iZ}yVboBrk_*b!$;ssx_<`ylyC*ux z?v3`aBYcEJd3oQX|Bs79w&xg6HK=ni)u4Rqk>fx9*LXG9`6#KyyLw6Wa-Zhqud@gF zr&DgjZ&q>}2BAx-CFIib0dW(Onv-x;%Z?b~gZ=Z-UQ512Z)Hr`FJr{n1w?{ryV+J*7X@h$taf1|n4J1AODq+fj0ahK*hX zo^0_M7<~Z!?$+OG;M>`rX+R&4-nZa_3J;&_PWt!$6c2o*%3}2oyd|~}54Ohp#D2HW z)4+q-e_k7J9F4R(-#O#45&wBtMD*vdBX3dRbM5j6fzE%k$EP)40=x-a17~J`sp$81 z^H&=3oED#h6`YEG1212kqQd8+@2C7*AB;erR^`8~u%3Ur#iM(4TmTOenSOQn2VXrL zWc{jx2aPGn@bH%}-QnMQk{}8v2=GiHL9Y*>i^c zaaZ5CK8^eu(DSK=Ogsf1rjuVI*MDcUUHIzR$*~IpXVm^-c16!@~p2=@#l-@)W5LKm%u0FJ{>+E{)_X+f{4#9o%rnSWh;MzC%wu?=+V%( zu`SLI|6=R%FAY9{OW+Yaig8Gn?@{>obfDLh3Xef$@0>_PAx>_YGZ z9X`Ku`(6IcSM^{IcIklY;8Tn^y*n;o9Kya3+lIdGEU#J{EFND=k;p%`%uMt zHOA%J|Rl@ z%c=2TJWqFcFz`epzu=!iyXf@ohu`|a!64(oUZ0`Pze2ye)rY9@ur?y_4?T^}U-!-j zTmAFbd9c4a`ix+z`tnWd!DA!#*XrprzyI-Ffqs9hz5J>_9%qSHG$kHq;KQB#gWl3i zzH9`aefm5xbKC>GPA5-yzV^t0Aj{LLe3`2C(y8?;ulT}qlUiQiernA=$u~R@D-j=ozYBVvy?xud{OOK= z4gPoLuSWE}u85#-_|d>y;4wOVZ{(}*`Nvlt@ClsNi%-#?ZuNQN`2yFtu6_KWb@5Gu z&psWWz$N4<`iTyoXYbl@wXgc8%k`ju&nkI(@r|eVTo5?y=p6!zY2h2R;+e0rnBKy$reKmVfaasqAI&70yeChnw$T-X3H; z2ps62j`Q=t`);2n#-aE6&ho4^@+&qje74UwJP_zt_VUY`Ujf{>)1NU8B?){4{VHhr z8P0fYM1FxUFHSk-Sy1Kwd^GxMD9HE(?nH@rx^=xx9M@cYUM}m`fxicuzJA?1?>?9D z4W9)16>z7OOSeVDcDw6WX1@j9dK$44KM!QSBJ4rPHGu=@+pA{}R>|{##{(LXr(OEE zgIqQ1|8)4g;gjD!;u~KP=zI45+SS7+^uOBi(eC~g|GC4b;V-3;@4zA6qXK;tzthd%xpeql|K8u~ zLB0bQe%nX(#U> z8vIMcpM4S0Vel*A{J<+6f4&^PXEDg~y#`(tf4CK&IL_G8f)_=J zcz0+19sFscUn;ytlux*N^Y6|}e_ilRz7xc|G5<(B2dsD9?(;{$U-b6!`49JQ4l*9< z)3>0Pc+UrHo;+(LpRjrQ)k{Z0_xgqh5Aq9kX{{VX{zE5zS1Z3@KT^d|d-afVJpHBQ~=HFJ&|8h6JfFnBnQ_N$r=8cK* zO6T8uujiXj1QDO$&l>)c;NM%l{D&(3*7C!Lo09)v;L9C84gRO$Pn^#@pQX}Hvh`u6 zo_N7Oe~t(H9sFsJ&-(q(7XEDtzVJN#*I#MyY4$(EE`q&8mp}H4zkBkrAoFJxe^u$9 zR{0JbyW^hxt9g6L@U?eAO>q|1MVw<8Nw)rr6jg20jx+pf-mT9*2 z$TMFX_iw#50zBBuFEig%#*xeSnHuB0d3@1GJQwi@w2N-NppnO)|5T9q!CwA!MY=Eo zw6b0C06T!re|670pUDRq4^{Tp3hN8B8y=dM@9`%GpC9%O4<7tQwc}qWgG#>l>e=IF z{iW6Y^)&W48 zo}PF+7exMqU%(u16#Y}bf541SQQxeuw+_CVr|-AsdIGVr{fe*n>}LtKPd8-`u$$Q} z(f*TVCSC$L%kDnS4o8QJ$pJhU@F4J6L!Xs+W&Q8hsL~gIba3sBkyghi>>bFBs`js- z@3zkPa@H?`@qUe4zrN4E_!NQv#GcPTB=)Bb=;O${zTk|PM*Jt39|60M&i_C3!oEP- zN8nEAU&F;2wuLVT=s@$Jl)ZuC$17vX;fKI!69|MmR2WRUS`kB`;MH*O#O zuMvFq>HG=aY@Yvf;8Pz4`u!UA_-yy}pUr$lCr^Ll*rpy|`O0T|*e|_A+=A@*MmLKLOo%*&hyV@ol^i*vmri(?2WmBWimYdX`&z8P9R0{G>+gW#r2N zCva}fwT>k`iZw($Q{>IgtA7RdaY4v!IGk-Or?}3-h^CR#(9X>zv=EScAnXl~S z8@0aIYJ3Gh3h+Re5A(>6zH!JmJb1`|fZkv0$AkaE%2!VQJu{wKoo{NKZG7Vs{Rf@5 zFMz*szhc|KGZkOGcuxImetbFePk*)$WPDcnFRIqNv*zdEc|i^X*H!hk_*ZlCYrrdR z^)mxst%A?m`LLDtMZ|cZ$9=fLe_}uCb$MeVdaN=`;kJcS8?)co0C(d|n#6E&Q z9Js8EhluDuIzE5#{CJ?>j{|?8=ohRHSwB95=Ks9h@cGeOfAdA(_>;ijXrE6(JwLa? z4=&{`f4r=wegM|bhJ6A1)SNFN?x$m)>d!SIN;>=W z>0|Lbd>t{G~Jfe`rj%<|SNJ`MhKXAc?mo$=8~em?SDffG9Y^W-n@==BW`9`aowPwn-u z)sv_8^4yF^0}t--%f}J4x3+p9Z zBwl~RE53~S&iH7AuW&x#jt&oV`#N-SCj~Dco{RB9 zXODEh`rta>__Gbm@?klnehDkxF?EO}m7bU?V&)Ttxy)Xr_Zg+{sM3E* z891t}8Rr4h!IMf@KjBPaae>#nBG$!^f8d}pc;@seW$37~c04N1^#$e1>FMbss)eb2 zM5hYMaPrJCW!-peh}Yq*kIU(W`Rt-B=auy_Rh676=J=Jf8D&Fkly4^Tc{!g?%b7x4 zy;?ju_n0i7m|s*zVq0u>E5$^vAm{l0jj>HO`}NA3V!Qdy?1G$$&lTi!K5o8@I4(Y) zn2qmMHpeDbJ{pbEj~|WgS^4-%EARP8oS quI=9*+s+%aAm>uN4->O;JjD_jUKll$tK@=mb#ihUBU9jT^8W*CsKpfk literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt index 838a4f2..fd72837 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,6 @@ python-dotenv pypdf pypandoc Spire.Doc -plum-dispatch==1.7.4 \ No newline at end of file +plum-dispatch==1.7.4 +pandas +scikit-learn \ No newline at end of file diff --git a/scripts/run_assessment_prediction_trainer.py b/scripts/run_assessment_prediction_trainer.py new file mode 100644 index 0000000..2618b86 --- /dev/null +++ b/scripts/run_assessment_prediction_trainer.py @@ -0,0 +1,45 @@ +import os +import logging +from logging.handlers import RotatingFileHandler +from sklearn.ensemble import RandomForestRegressor +from sklearn.multioutput import MultiOutputRegressor +from src.pipeline.data_preprocessor import DataPreprocessor +from src.pipeline.model_trainer import ModelTrainer + +# Set up logging +handler = RotatingFileHandler('/root/ds_erp_ai/logs/prediction_pipeline.log', maxBytes=100000, backupCount=3) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(handler) + +# Example of DataPreprocessor and ModelTrainer classes from the previous steps +class CompanyModelPipeline: + def __init__(self, company_ids, input_base_path): + self.company_ids = company_ids + self.input_base_path = input_base_path + + def run_pipeline(self): + for company_id in self.company_ids: + try: + # Define paths for the company + input_path = os.path.join(self.input_base_path, f'{company_id}_raw_data.csv') + + logger.info(f"Starting preprocessing for company {company_id}.") + + # Step 1: Preprocess the data + preprocessor = DataPreprocessor(input_path=input_path, company_id=company_id) + processed_data_path = preprocessor.run() + logger.info(f"Data preprocessing completed for company {company_id}. Processed data saved to {processed_data_path}.") + + # Step 2: Train and save the model + model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42)) + trainer = ModelTrainer(preprocessed_data_path=processed_data_path, company_id=company_id, model=model) + model_path, latest_data_path, evaluation_results = trainer.run() + + logger.info(f"Model training and evaluation completed for company {company_id}.") + logger.info(f"Model saved to {model_path} and latest data saved to {latest_data_path}.") + logger.info(f"Evaluation Results for company {company_id}: {evaluation_results}") + + except Exception as e: + logger.error(f"An error occurred while processing company {company_id}: {e}") + diff --git a/src/pipeline/data_preprocessor.py b/src/pipeline/data_preprocessor.py index e69de29..8bfa129 100644 --- a/src/pipeline/data_preprocessor.py +++ b/src/pipeline/data_preprocessor.py @@ -0,0 +1,68 @@ +import pandas as pd +import os + +class DataPreprocessor: + def __init__(self, input_path, company_id): + self.input_path = input_path + self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id) + self.company_id = company_id + self.df = None + + def load_data(self): + self.df = pd.read_csv(self.input_path) + + def preprocess(self): + # Convert 'start_date' and 'end_date' to datetime + self.df['start_date'] = pd.to_datetime(self.df['start_date']) + self.df['end_date'] = pd.to_datetime(self.df['end_date']) + + # Add duration (in days) by subtracting start_date from end_date + self.df['duration'] = (self.df['end_date'] - self.df['start_date']).dt.days + + # Drop the 'start_date' and 'end_date' columns as they are not needed for training + self.df.drop(columns=['start_date', 'end_date'], inplace=True) + + # Convert 'assessment_type' to categorical (one-hot encoding) + self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False) + + # Convert boolean columns to 1s and 0s + self.df['assessment_type_weekly'] = self.df['assessment_type_weekly'].astype(int) + self.df['assessment_type_biweekly'] = self.df['assessment_type_biweekly'].astype(int) + self.df['assessment_type_quarterly'] = self.df['assessment_type_quarterly'].astype(int) + + # Function to create lagged features based on assessment type + def create_lagged_features(df, col, assessment_col): + lagged_col = f"{col}_{assessment_col}_lag_1" + df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1) + return df + + # Create lagged features for each assessment type + self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_weekly') + self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_biweekly') + self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_quarterly') + + # Fill NaNs with 0 instead of dropping rows + self.df.fillna(0, inplace=True) + + # Create moving averages for each assessment type + self.df['open_items_weekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0) + self.df['open_items_biweekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0) + self.df['open_items_quarterly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0) + + # Add percentage change in open items + self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100 + + def save_data(self): + os.makedirs(self.output_dir, exist_ok=True) # Ensure output directory exists + output_path = os.path.join(self.output_dir, 'output.csv') + self.df.to_csv(output_path, index=False) + return output_path + + def run(self): + self.load_data() + self.preprocess() + return self.save_data() + +# Example usage: +# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123') +# processed_data_path = preprocessor.run() diff --git a/src/pipeline/model_trainer.py b/src/pipeline/model_trainer.py index e69de29..dd7b769 100644 --- a/src/pipeline/model_trainer.py +++ b/src/pipeline/model_trainer.py @@ -0,0 +1,88 @@ +import pandas as pd +import os +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestRegressor +from sklearn.multioutput import MultiOutputRegressor +import joblib +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +import logging +from logging.handlers import RotatingFileHandler + + +handler = RotatingFileHandler('/root/ds_erp_ai/logs/prediction_pipeline.log', maxBytes=100000, backupCount=3) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(handler) + +class ModelTrainer: + def __init__(self, preprocessed_data_path, company_id, model): + self.preprocessed_data_path = preprocessed_data_path + self.output_dir = os.path.join('models', 'assessment_prediction', company_id) + self.company_id = company_id + self.df = None + self.model = model # Model passed as an argument + self.X_test = None + self.y_test = None + + def load_data(self): + self.df = pd.read_csv(self.preprocessed_data_path) + + def train_model(self): + # Split data into features (X) and target variables (y) + X = self.df.drop(columns=['open_items', 'red_flags']) + y = self.df[['open_items', 'red_flags']] # Multi-target for open items and red flags + + # Split into training and test sets with 10% as test size + X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42) + + # Train the model + self.model.fit(X_train, y_train) + + # Save the trained model + os.makedirs(self.output_dir, exist_ok=True) + model_path = os.path.join(self.output_dir, f'{self.company_id}_model.pkl') + joblib.dump(self.model, model_path) + print(f"Model saved to {model_path}") + + # Save the latest row (last assessment data) for inference + latest_data_path = os.path.join(self.output_dir, f'{self.company_id}_latest_data.csv') + self.df.tail(1).to_csv(latest_data_path, index=False) + print(f"Latest assessment data saved to {latest_data_path}") + + # Return the model path and latest data path + return model_path, latest_data_path + + def evaluate_model(self): + # Predict using the test data + y_pred = self.model.predict(self.X_test) + + # Calculate evaluation metrics + mae = mean_absolute_error(self.y_test, y_pred) + mse = mean_squared_error(self.y_test, y_pred) + r2 = r2_score(self.y_test, y_pred) + + print("Model Evaluation Metrics:") + print(f"Mean Absolute Error (MAE): {mae}") + print(f"Mean Squared Error (MSE): {mse}") + print(f"R-squared (R²): {r2}") + + # Return evaluation results + return {'mae': mae, 'mse': mse, 'r2': r2} + + def run(self): + # Load data and train the model + self.load_data() + model_path, latest_data_path = self.train_model() + + # Evaluate the model immediately after training + evaluation_results = self.evaluate_model() + + return model_path, latest_data_path, evaluation_results + +# Example usage +'''model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42)) +trainer = ModelTrainer(preprocessed_data_path=res, company_id='testid', model=model) +model_path, latest_data_path, evaluation_results = trainer.run() +print(f"The model was saved at: {model_path}") +print(f"The latest data was saved at: {latest_data_path}") +print(f"Evaluation Results: {evaluation_results}")''' diff --git a/test.py b/test.py index 4cbb3f2..1e47022 100644 --- a/test.py +++ b/test.py @@ -1,16 +1,7 @@ -# Example usage of the Chatbot class: -from src.services.chatbot import Chatbot -from src.utils.document_loader import load_document -if __name__ == "__main__": - chatbot = Chatbot() +# Example usage +from scripts.run_assessment_prediction_trainer import CompanyModelPipeline +company_ids = ['company_123', 'company_456', 'company_789'] +input_base_path = '/root/ds_erp_ai/data/raw/dummy_assessment_data.csv' # The base path where the raw data for each company is stored - # Example inputs - path = r"C:\Users\User\Desktop\Blessing_AI\MKD\test_erp_ai\erp_ai\test\erp_ai\data\raw\coding_task_completion_document.pdf" - - question = "Have you completed Task X?" - user_input = "Yes" - docs = load_document(path) - - # Validate the worker's answer using the provided document - validation_result = chatbot.validate_worker(question, docs) - print(validation_result) \ No newline at end of file +pipeline = CompanyModelPipeline(company_ids=company_ids, input_base_path=input_base_path) +pipeline.run_pipeline() diff --git a/tests/test_pipeline/__init__.py b/tests/test_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_pipeline/test_data_preprocessor.py b/tests/test_pipeline/test_data_preprocessor.py new file mode 100644 index 0000000..95abb7a --- /dev/null +++ b/tests/test_pipeline/test_data_preprocessor.py @@ -0,0 +1,18 @@ +import unittest +from src.pipeline.data_preprocessor import DataPreprocessor +import os +class TestDataPreprocessor(unittest.TestCase): + + def setUp(self): + self.dp = DataPreprocessor( + input_path="/root/ds_erp_ai/data/raw/dummy_assessment_data.csv", + company_id="company_id" + ) + + def test_run(self): + res = self.dp.run() + self.assertIsNotNone(res) # Check that the result is not None + self.assertTrue(os.path.exists(res)) # Check that the output file exists + +if __name__ == '__main__': + unittest.main() \ No newline at end of file