From ff1fcdee4244f251b53cc99f82d5fef50c483a0a Mon Sep 17 00:00:00 2001
From: Olivier Labayle
Date: Tue, 16 Apr 2024 08:34:17 +0200
Subject: [PATCH 1/9] add some docs and fix bug in cv prediction

---
 docs/src/assets/sample_splitting.png      | Bin 0 -> 19353 bytes
 docs/src/index.md                         |  37 +++++++++++++--
 docs/src/user_guide/estimation.md         |  55 +++++++++++++++++++---
 src/counterfactual_mean_based/gradient.jl |   3 ++
 src/estimates.jl                          |  54 +++++++++++++++++----
 src/utils.jl                              |  13 ++++-
 test/estimators_and_estimates.jl          |  23 ++++++++-
 7 files changed, 166 insertions(+), 19 deletions(-)
 create mode 100644 docs/src/assets/sample_splitting.png

diff --git a/docs/src/assets/sample_splitting.png b/docs/src/assets/sample_splitting.png
new file mode 100644
index 0000000000000000000000000000000000000000..b1d724d8a0c7053679521a6e5dd319ad1cc6b5aa
GIT binary patch
literal 19353
[... base85-encoded binary PNG data omitted ...]
z-S|o0ds{dWl8ni&Ix=A-QCL7!>y{K}jNgV{`_`UmBY(B7KP98--|X8N#F^#sv4UQ8 zulEbtT{c10){kh8cwf5mtk=3GbwO&kFQql-jXJGgbf?b_SR4}u#E+vXJ6r7cDpvYF zf!cHf?XZ1_y|(LcdB5!w6SJD9_1nZiV?{fW6toPj6f6P+Qpf!@T|5 zb6q?NO&#P)S8h@JyB6|nZ+|)Vu&gdpF?HVMjO0GByNRX0CCE8dteB~cUDuZ6&9=_1 z9b`m;2Uc&*A?ON0tur%dn_nRP=K0o-fg3`^u50iS+K&hx zouN=R!c$w7R#&zpZHER<^}qB`Yb6zQ=UGovO}%PSx*3cTc}Y}vC*gg9nnhU0{ycJ1 zX@l~s6r>pK{y-De(2+#gBs@NJI`JaXrBk|=<*dHtI}xgQeMG*u-;yZx*h+2>>J+&6 zmP=%oM(z;|fCD<=gVqJM#8xX0cht`xqE}b62N(KIU5p3rWWI5V5SU)C$0PGd*8R>^SXqY{nC^oIJfyD3T5e9L+ZD#4jnt_dXs!KM|AUoe_nS zF7~n!NIuQ^gHl;`3?djhpho>|r8b{lvd5qJGmmyAvzjWpP=+Gspc2Y7S=w5DzwlOd z+zC%_9qE3hsEhP!{ zhe8iwgHJUlvwE=J@;{pYSh?QhZE8|%>1;_(fI>mhE2>b=f7k9cHk}dK!AA#>wvOt< zT0StW_ttwX add TMLE To run an estimation procedure, we need 3 ingredients: -1. A dataset: here a simulation dataset. +### 1. A dataset: here a simulation dataset For illustration, assume we know the actual data generating process is as follows: @@ -52,7 +52,7 @@ dataset = (Y=Y, T=categorical(T), W=W) nothing # hide ``` -2. A quantity of interest: here the Average Treatment Effect (ATE). +### 2. A quantity of interest: here the Average Treatment Effect (ATE) The Average Treatment Effect of ``T`` on ``Y`` confounded by ``W`` is defined as: @@ -64,7 +64,7 @@ The Average Treatment Effect of ``T`` on ``Y`` confounded by ``W`` is defined as ) ``` -3. An estimator: here a Targeted Maximum Likelihood Estimator (TMLE). +### 3. An estimator: here a Targeted Maximum Likelihood Estimator (TMLE) ```@example quick-start tmle = TMLEE() @@ -79,3 +79,34 @@ using Test # hide @test pvalue(OneSampleTTest(result, 2.5)) > 0.05 # hide nothing # hide ``` + +## Scope and Distinguishing Features + +The goal of this package is to provide an entry point for semi-parametric asymptotic unbiased and efficient estimation in Julia. The two main general estimators that are known to achieve these properties are the One-Step estimator and the Targeted Maximum-Likelihood estimator. Most of the current effort as been centered around estimands that are composite of the counterfactual mean. + +Distinguishing Features: + +- Estimands: Counterfactual Mean, Average Treatment Effect, Interactions, Any composition thereof +- Estimators: TMLE, One-Step +- Machine-Learning: Any [MLJ](https://alan-turing-institute.github.io/MLJ.jl/stable/) compatible model +- Treatments Variables: + - Multiple treatment variables (with their own set of confounders) + - Categorical treatment variables (factorial analysis) + +## Citing TMLE.jl + +If you use TMLE.jl for your own work and would like to cite us, here are the BibTeX and APA formats: + +- BibTeX + +```bibtex +@software{Labayle_TMLE_jl, + author = {Labayle, Olivier and Beentjes, Sjoerd and Khamseh, Ava and Ponting, Chris}, + title = {{TMLE.jl}}, + url = {https://github.com/olivierlabayle/TMLE.jl} +} +``` + +- APA + +Labayle, O., Beentjes, S., Khamseh, A., & Ponting, C. TMLE.jl [Computer software]. https://github.com/olivierlabayle/TMLE.jl \ No newline at end of file diff --git a/docs/src/user_guide/estimation.md b/docs/src/user_guide/estimation.md index 61bd2a86..a855eab7 100644 --- a/docs/src/user_guide/estimation.md +++ b/docs/src/user_guide/estimation.md @@ -111,21 +111,64 @@ Again, required nuisance functions are fitted and stored in the cache. ## CV-Estimation -Both TMLE and OSE can be used with sample-splitting, which, for an additional computational cost, further reduces the assumptions we need to make regarding our data generating process ([see here](https://arxiv.org/abs/2203.06469)). 
Note that this sample-splitting procedure should not be confused with the sample-splitting happening in Super Learning. Using both CV-TMLE and Super-Learning will result in two nested sample-splitting loops.
+When performing "vanilla" semi-parametric estimation, we are essentially using the dataset twice: once for the estimation of the nuisance functions and once for the estimation of the parameter of interest. This means that there is a risk of over-fitting and residual bias ([see here](https://arxiv.org/abs/2203.06469) for some discussion). One way to address this limitation is to use a technique called sample-splitting / cross-validation.
 
-To leverage sample-splitting, simply specify a `resampling` strategy when building an estimator:
+### Usage
+
+To create a cross-validated estimator, simply specify the `resampling` keyword argument:
 
 ```@example estimation
-cvtmle = TMLEE(resampling=CV())
-cvresult₁, _ = cvtmle(Ψ₁, dataset);
+TMLEE(resampling=StratifiedCV());
 ```
 
-Similarly, one could build CV-OSE:
+or
 
 ```julia
-cvose = OSE(resampling=CV(nfolds=3))
+OSE(resampling=StratifiedCV(nfolds=3));
+```
+
+We further explain below the procedure for both Targeted Maximum-Likelihood estimation and One-Step estimation, and provide some considerations for using sample-splitting.
+
+### Preliminaries
+
+Even though the idea behind cross-validated estimators is simple, the procedure may seem obscure at first. The purpose of this subsection is to explain how it works, and it will be useful to introduce some notation. Let `resampling` be a `MLJ.ResamplingStrategy` partitioning the dataset into ``K`` folds. According to this `resampling` method, each sample ``i`` in the dataset belongs to a specific (validation) fold ``k``:
+
+- ``k(i)`` denotes the (validation) fold that sample ``i`` belongs to, with ``k(i) \in \{1, ..., K\}``.
+- ``-k(i)`` denotes the remaining (training) folds (see the sketch and figure below).
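+
+For instance, here is a small illustration of how each sample is assigned its validation fold ``k(i)``. The variable names and the toy 6-sample setup are purely for exposition:
+
+```julia
+using MLJBase
+
+n = 6
+resampling = CV(nfolds=3)
+train_val_pairs = MLJBase.train_test_pairs(resampling, 1:n)
+
+# fold_of[i] is the validation fold k(i) that sample i belongs to
+fold_of = zeros(Int, n)
+for (k, (_, val_idx)) in enumerate(train_val_pairs)
+    fold_of[val_idx] .= k
+end
+fold_of # [1, 1, 2, 2, 3, 3]
+```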
+
+```@raw html
+<img src="../assets/sample_splitting.png" alt="Sample splitting across folds" style="width:100%;"/>
+```
+
+The estimators we are considering are asymptotically linear. This means they can be written as an average over their influence function. If we denote this influence function by ``\phi``, which itself depends on nuisance functions (e.g. outcome mean, propensity score...), then ``i \mapsto \hat{\phi}^{-k(i)}(i)`` is a cross-validated estimator of this influence function. The notation means that the nuisance functions learnt on all folds but the one containing sample ``i`` are used to make the prediction for sample ``i``. Here, we loosely refer to ``i`` as a sample (e.g. ``(Y, T, W)_i``), regardless of the actual data structure.
+
+We are now ready to define the cross-validated One-Step and Targeted Maximum-Likelihood estimators.
+
+### CV-OSE
+
+The cross-validated One-Step estimator is the average of the cross-validated influence function; it can be compactly written as:
+
+```math
+\hat{\Psi}_{CV} = \frac{1}{n} \sum_{i=1}^n \hat{\phi}^{-k(i)}(i)
+```
+
+And the associated variance estimator:
+
+```math
+\hat{V}_{\Psi, CV} = \frac{1}{n-1} \sum_{i=1}^n (\hat{\phi}^{-k(i)}(i) - \hat{\Psi}_{CV})^2
+```
+
+### CV-TMLE
+
+The targeted procedure is similar: the fluctuation step is performed by pooling the out-of-fold predictions of the initial outcome model across folds, and the final estimate is an average of fold-specific plugin estimates.
+
+### Further Considerations
+
+- Choice of `resampling` Strategy: The theory behind sample-splitting requires the nuisance functions to be sufficiently well estimated on **each and every** fold. A practical consequence is that each validation fold should be representative of the dataset. In particular, when the treatment and outcome variables are categorical, it is important to make sure the class proportions are preserved. This is typically done using `StratifiedCV`.
+- Computational Complexity: Sample-splitting results in ``K`` fits of the nuisance functions, drastically increasing computational complexity. In particular, if the nuisance functions are estimated using (P-fold) Super-Learning, this will result in two nested cross-validation loops and ``K \times P`` fits.
+- Caching of Nuisance Functions: Because the `resampling` strategy typically needs to preserve the outcome and treatment proportions, very little reuse of cached models is possible (see below).
+
 ## Caching model fits
 
 Let's now see how the `cache` can be reused with a new estimand, say the Total Average Treatment Effect of both `T₁` and `T₂`.
diff --git a/src/counterfactual_mean_based/gradient.jl b/src/counterfactual_mean_based/gradient.jl
index 882e325e..4fd6c066 100644
--- a/src/counterfactual_mean_based/gradient.jl
+++ b/src/counterfactual_mean_based/gradient.jl
@@ -26,6 +26,9 @@ end
 
 compute_estimate(ctf_aggregate, ::Nothing) = mean(ctf_aggregate)
 
+"""
+Accounting for possibly slightly different fold sizes.
+""" compute_estimate(ctf_aggregate, train_validation_indices) = mean(compute_estimate(ctf_aggregate[val_indices], nothing) for (_, val_indices) in train_validation_indices) diff --git a/src/estimates.jl b/src/estimates.jl index f49eee45..78fe4a52 100644 --- a/src/estimates.jl +++ b/src/estimates.jl @@ -50,14 +50,52 @@ string_repr(estimate::SampleSplitMLConditionalDistribution) = Base.typename(typeof(first(estimate.machines).model)).wrapper ) +function fold_prediction(estimate::SampleSplitMLConditionalDistribution, X, val_idx, fold) + Xval = selectrows(X, val_idx) + return predict(estimate.machines[fold], Xval) +end + +""" +Default unoptimized method +""" +function cv_predict(target_scitype::Type{<:Any}, estimate, X) + fold_to_val_idx = [(fold, val_idx) for (fold, (_, val_idx)) in enumerate(estimate.train_validation_indices)] + first_fold, first_val_idx = first(fold_to_val_idx) + first_preds = fold_prediction(estimate, X, first_val_idx, first_fold) + + ŷ = Vector{eltype(first_preds)}(undef, nrows(X)) + ŷ[first_val_idx] = first_preds + for (fold, val_idx) in fold_to_val_idx[2:end] + preds = fold_prediction(estimate, X, val_idx, fold) + ŷ[val_idx] = preds + end + return ŷ +end + +function update_probs!(probs, prob_given_ref, first_val_idx) + for (key, vals) in prob_given_ref + probs[first_val_idx, key] = vals + end +end + +function cv_predict(target_scitype::Type{<:AbstractVector{<:Finite}}, estimate, X) + fold_to_val_idx = [(fold, val_idx) for (fold, (_, val_idx)) in enumerate(estimate.train_validation_indices)] + first_fold, first_val_idx = first(fold_to_val_idx) + first_preds = fold_prediction(estimate, X, first_val_idx, first_fold) + probs = Matrix{Float64}(undef, nrows(X), length(first_preds.prob_given_ref)) + update_probs!(probs, first_preds.prob_given_ref, first_val_idx) + for (fold, val_idx) in fold_to_val_idx[2:end] + preds = fold_prediction(estimate, X, val_idx, fold) + update_probs!(probs, preds.prob_given_ref, val_idx) + end + return UnivariateFinite(support(first_preds), probs) +end + + function MLJBase.predict(estimate::SampleSplitMLConditionalDistribution, dataset) X = selectcols(dataset, estimate.estimand.parents) - ŷs = [] - for (fold, (_, validation_indices)) in enumerate(estimate.train_validation_indices) - Xval = selectrows(X, validation_indices) - push!(ŷs, predict(estimate.machines[fold], Xval)) - end - return vcat(ŷs...) 
+ target_scitype = MLJBase.target_scitype(first(estimate.machines).model) + return cv_predict(target_scitype, estimate, X) end ##################################################################### @@ -76,7 +114,7 @@ function likelihood(estimate::ConditionalDistributionEstimate, dataset) return pdf.(ŷ, y) end -function compute_offset(ŷ::UnivariateFiniteVector{<:Union{OrderedFactor{2}, Multiclass{2}}}) +function compute_offset(ŷ::AbstractVector{<:UnivariateFinite{<:Union{OrderedFactor{2}, Multiclass{2}}}}) μy = expected_value(ŷ) logit!(μy) return μy @@ -84,7 +122,7 @@ end compute_offset(ŷ::AbstractVector{<:Distributions.UnivariateDistribution}) = expected_value(ŷ) -compute_offset(ŷ::AbstractVector{<:Real}) = expected_value(ŷ) +compute_offset(ŷ::AbstractVector{T}) where T<:Real = expected_value(ŷ) function compute_offset(estimate::ConditionalDistributionEstimate, X) ŷ = predict(estimate, X) diff --git a/src/utils.jl b/src/utils.jl index 2a3a6e52..c23d9aa1 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -9,7 +9,18 @@ key(estimand, estimator) = (key(estimand), key(estimator)) unique_sorted_tuple(iter) = Tuple(sort(unique(Symbol(x) for x in iter))) +""" +For "vanilla" estimators, missingness management is deferred to the nuisance function estimators. +This is in order to maximize data usage. +""" choose_initial_dataset(dataset, nomissing_dataset, resampling::Nothing) = dataset + +""" +For cross-validated estimators, missing data are removed early on based on all columns relevant to the estimand. +This is to avoid the complications of: + - Equally distributing missing across folds + - Tracking sample_ids +""" choose_initial_dataset(dataset, nomissing_dataset, resampling) = nomissing_dataset selectcols(data, cols) = data |> TableOperations.select(cols...) |> Tables.columntable @@ -46,7 +57,7 @@ function indicator_values(indicators, T) return indic end -expected_value(ŷ::UnivariateFiniteVector{<:Union{OrderedFactor{2}, Multiclass{2}}}) = pdf.(ŷ, levels(first(ŷ))[2]) +expected_value(ŷ::AbstractArray{<:UnivariateFinite{<:Union{OrderedFactor{2}, Multiclass{2}}}}) = pdf.(ŷ, levels(first(ŷ))[2]) expected_value(ŷ::AbstractVector{<:Distributions.UnivariateDistribution}) = mean.(ŷ) expected_value(ŷ::AbstractVector{<:Real}) = ŷ diff --git a/test/estimators_and_estimates.jl b/test/estimators_and_estimates.jl index f7fb5c23..8557ec0e 100644 --- a/test/estimators_and_estimates.jl +++ b/test/estimators_and_estimates.jl @@ -7,11 +7,16 @@ using DataFrames using MLJGLMInterface using MLJModels using LogExpFunctions +using Distributions verbosity = 1 n = 100 X, y = make_moons(n) dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) + +X, y = make_regression(n) +continuous_dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) + estimand = TMLE.ConditionalDistribution(:Y, [:X₁, :X₂]) fit_log = string("Estimating: ", TMLE.string_repr(estimand)) reuse_log = string("Reusing estimate for: ", TMLE.string_repr(estimand)) @@ -42,7 +47,7 @@ end @testset "Test SampleSplitMLConditionalDistributionEstimator" begin nfolds = 3 - train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=nfolds), 1:n, dataset)) + train_validation_indices = Tuple(MLJBase.train_test_pairs(StratifiedCV(nfolds=nfolds), 1:n, dataset, dataset.Y)) model = LinearBinaryClassifier() estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( model, @@ -54,6 +59,7 @@ end expected_features = collect(estimand.parents) @test all(fitted_params(mach).features == expected_features for mach in conditional_density_estimate.machines) ŷ = predict(conditional_density_estimate, 
dataset) + @test ŷ isa UnivariateFiniteVector μ̂ = TMLE.expected_value(conditional_density_estimate, dataset) for foldid in 1:nfolds train, val = train_validation_indices[foldid] @@ -87,6 +93,21 @@ end @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) end +@testset "Test SampleSplitMLConditionalDistributionEstimator: Continuous outcome" begin + nfolds = 3 + train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=nfolds), 1:n, continuous_dataset)) + model = MLJGLMInterface.LinearRegressor() + estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( + model, + train_validation_indices + ) + conditional_density_estimate = estimator(estimand, continuous_dataset; verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Distributions.Normal{Float64}} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test μ̂ isa Vector{Float64} +end + @testset "Test compute_offset MLConditionalDistributionEstimator" begin dataset = ( T = categorical(["a", "b", "c", "a", "a", "b", "a"]), From 187ceee321a3a8c2e164bd3ffa15d75443ad360d Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Tue, 16 Apr 2024 12:14:32 +0200 Subject: [PATCH 2/9] refactor some methods --- src/estimates.jl | 72 ++++++++++++++++++++------------ test/estimators_and_estimates.jl | 15 ++++++- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/estimates.jl b/src/estimates.jl index 78fe4a52..175b5706 100644 --- a/src/estimates.jl +++ b/src/estimates.jl @@ -50,52 +50,70 @@ string_repr(estimate::SampleSplitMLConditionalDistribution) = Base.typename(typeof(first(estimate.machines).model)).wrapper ) -function fold_prediction(estimate::SampleSplitMLConditionalDistribution, X, val_idx, fold) - Xval = selectrows(X, val_idx) +""" +Prediction for the subset of X identified by idx and fold. +""" +function fold_prediction(estimate::SampleSplitMLConditionalDistribution, X, idx, fold) + Xval = selectrows(X, idx) return predict(estimate.machines[fold], Xval) end """ -Default unoptimized method +In the case where newpreds is a UnivariateFiniteVector, we update the probability matrix. """ -function cv_predict(target_scitype::Type{<:Any}, estimate, X) - fold_to_val_idx = [(fold, val_idx) for (fold, (_, val_idx)) in enumerate(estimate.train_validation_indices)] - first_fold, first_val_idx = first(fold_to_val_idx) - first_preds = fold_prediction(estimate, X, first_val_idx, first_fold) - - ŷ = Vector{eltype(first_preds)}(undef, nrows(X)) - ŷ[first_val_idx] = first_preds - for (fold, val_idx) in fold_to_val_idx[2:end] - preds = fold_prediction(estimate, X, val_idx, fold) - ŷ[val_idx] = preds +function update_preds!(probs::Matrix, newpreds::UnivariateFiniteVector, idx) + for (key, vals) in newpreds.prob_given_ref + probs[idx, key] = vals end - return ŷ end -function update_probs!(probs, prob_given_ref, first_val_idx) - for (key, vals) in prob_given_ref - probs[first_val_idx, key] = vals - end -end +update_preds!(ŷ, preds, idx) = ŷ[idx] = preds + +""" +In the case where predictions are a UnivariateFiniteVector, we store a Matrix of probabilities. +""" +initialize_cv_preds(first_preds::UnivariateFiniteVector, n) = + Matrix{Float64}(undef, n, length(first_preds.prob_given_ref)) -function cv_predict(target_scitype::Type{<:AbstractVector{<:Finite}}, estimate, X) +""" +As a default, we initialize predictions with a Vector of the type corresponding to the +predictions from the first machine. 
+""" +initialize_cv_preds(first_preds, n) = + Vector{eltype(first_preds)}(undef, n) + +""" +In the case where predictions are a UnivariateFiniteVector, we create a special +UnivariateFinite vector for downstream optimizaton. +""" +finalize_cv_preds(probs, first_preds::UnivariateFiniteVector) = UnivariateFinite(support(first_preds), probs) + +""" +As a default we simply return the vector +""" +finalize_cv_preds(ŷ, first_preds) = ŷ + +""" +Out of fold prediction, predictions for fold k are made from machines trained on fold k̄. +We distinguish the case where preidctions are a UnivariateFiniteVector that requires specific attention. +""" +function cv_predict(estimate, X) fold_to_val_idx = [(fold, val_idx) for (fold, (_, val_idx)) in enumerate(estimate.train_validation_indices)] first_fold, first_val_idx = first(fold_to_val_idx) first_preds = fold_prediction(estimate, X, first_val_idx, first_fold) - probs = Matrix{Float64}(undef, nrows(X), length(first_preds.prob_given_ref)) - update_probs!(probs, first_preds.prob_given_ref, first_val_idx) + + ŷ = initialize_cv_preds(first_preds, nrows(X)) + update_preds!(ŷ, first_preds, first_val_idx) for (fold, val_idx) in fold_to_val_idx[2:end] preds = fold_prediction(estimate, X, val_idx, fold) - update_probs!(probs, preds.prob_given_ref, val_idx) + update_preds!(ŷ, preds, val_idx) end - return UnivariateFinite(support(first_preds), probs) + return finalize_cv_preds(ŷ, first_preds) end - function MLJBase.predict(estimate::SampleSplitMLConditionalDistribution, dataset) X = selectcols(dataset, estimate.estimand.parents) - target_scitype = MLJBase.target_scitype(first(estimate.machines).model) - return cv_predict(target_scitype, estimate, X) + return cv_predict(estimate, X) end ##################################################################### diff --git a/test/estimators_and_estimates.jl b/test/estimators_and_estimates.jl index 8557ec0e..57002bad 100644 --- a/test/estimators_and_estimates.jl +++ b/test/estimators_and_estimates.jl @@ -8,6 +8,7 @@ using MLJGLMInterface using MLJModels using LogExpFunctions using Distributions +using MLJLinearModels verbosity = 1 n = 100 @@ -45,7 +46,7 @@ reuse_log = string("Reusing estimate for: ", TMLE.string_repr(estimand)) @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) end -@testset "Test SampleSplitMLConditionalDistributionEstimator" begin +@testset "Test SampleSplitMLConditionalDistributionEstimator: Binary outcome" begin nfolds = 3 train_validation_indices = Tuple(MLJBase.train_test_pairs(StratifiedCV(nfolds=nfolds), 1:n, dataset, dataset.Y)) model = LinearBinaryClassifier() @@ -96,6 +97,7 @@ end @testset "Test SampleSplitMLConditionalDistributionEstimator: Continuous outcome" begin nfolds = 3 train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=nfolds), 1:n, continuous_dataset)) + # Probabilistic Model model = MLJGLMInterface.LinearRegressor() estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( model, @@ -106,6 +108,17 @@ end @test ŷ isa Vector{Distributions.Normal{Float64}} μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) @test μ̂ isa Vector{Float64} + # Deterministic Model + model = MLJLinearModels.LinearRegressor() + estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( + model, + train_validation_indices + ) + conditional_density_estimate = estimator(estimand, continuous_dataset; verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Float64} + μ̂ = 
TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test μ̂ isa Vector{Float64} end @testset "Test compute_offset MLConditionalDistributionEstimator" begin From 3cc78ea02a864b6ec0386eeece30819c69dbde8d Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Tue, 16 Apr 2024 20:15:17 +0200 Subject: [PATCH 3/9] update test for conditional distribution estimates --- test/estimators_and_estimates.jl | 138 +++++++++++++++---------------- 1 file changed, 68 insertions(+), 70 deletions(-) diff --git a/test/estimators_and_estimates.jl b/test/estimators_and_estimates.jl index 57002bad..f0c146f7 100644 --- a/test/estimators_and_estimates.jl +++ b/test/estimators_and_estimates.jl @@ -13,7 +13,7 @@ using MLJLinearModels verbosity = 1 n = 100 X, y = make_moons(n) -dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) +binary_dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) X, y = make_regression(n) continuous_dataset = DataFrame(Y=y, X₁=X.x1, X₂=X.x2) @@ -22,82 +22,116 @@ estimand = TMLE.ConditionalDistribution(:Y, [:X₁, :X₂]) fit_log = string("Estimating: ", TMLE.string_repr(estimand)) reuse_log = string("Reusing estimate for: ", TMLE.string_repr(estimand)) -@testset "Test MLConditionalDistributionEstimator" begin +@testset "Test MLConditionalDistributionEstimator: binary outcome" begin + # Check predict / expected_value / compute_offset estimator = TMLE.MLConditionalDistributionEstimator(LinearBinaryClassifier()) # Fitting with no cache cache = Dict() - conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, dataset; cache=cache, verbosity=verbosity) + conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) expected_features = collect(estimand.parents) @test conditional_density_estimate isa TMLE.MLConditionalDistribution @test fitted_params(conditional_density_estimate.machine).features == expected_features - ŷ = predict(conditional_density_estimate, dataset) - mach_ŷ = predict(conditional_density_estimate.machine, dataset[!, expected_features]) + ŷ = predict(conditional_density_estimate, binary_dataset) + mach_ŷ = predict(conditional_density_estimate.machine, binary_dataset[!, expected_features]) @test all(ŷ[i].prob_given_ref == mach_ŷ[i].prob_given_ref for i in eachindex(ŷ)) - μ̂ = TMLE.expected_value(conditional_density_estimate, dataset) + μ̂ = TMLE.expected_value(conditional_density_estimate, binary_dataset) @test μ̂ == [ŷ[i].prob_given_ref[2] for i in eachindex(ŷ)] - @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, dataset)) # The pdf is not necessarily between 0 and 1 - # Uses the cache instead of fitting + offset = TMLE.compute_offset(conditional_density_estimate, binary_dataset) + @test offset == logit.(μ̂) + @test all(0. <= x <= 1. 
for x in TMLE.likelihood(conditional_density_estimate, binary_dataset)) # The pdf is not necessarily between 0 and 1 + # Check cache management + ## Uses the cache instead of fitting new_estimator = TMLE.MLConditionalDistributionEstimator(LinearBinaryClassifier()) @test TMLE.key(new_estimator) == TMLE.key(estimator) - @test_logs (:info, reuse_log) estimator(estimand, dataset; cache=cache, verbosity=verbosity) - # Changing the model leads to refit + @test_logs (:info, reuse_log) estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) + ## Changing the model leads to refit new_estimator = TMLE.MLConditionalDistributionEstimator(LinearBinaryClassifier(fit_intercept=false)) @test TMLE.key(new_estimator) != TMLE.key(estimator) - @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) + @test_logs (:info, fit_log) new_estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) +end + +@testset "Test MLConditionalDistributionEstimator: continuous outcome" begin + # Check predict / expected_value / compute_offset + ## Probabilistic Model + model = MLJGLMInterface.LinearRegressor() + estimator = TMLE.MLConditionalDistributionEstimator(model) + conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, continuous_dataset; cache=Dict(), verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Normal{Float64}} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test [ŷᵢ.μ for ŷᵢ in ŷ] == μ̂ + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ + + ## Deterministic Model + model = MLJLinearModels.LinearRegressor() + estimator = TMLE.MLConditionalDistributionEstimator(model) + conditional_density_estimate = estimator(estimand, continuous_dataset; cache=Dict(), verbosity=verbosity) + ŷ = predict(conditional_density_estimate, continuous_dataset) + @test ŷ isa Vector{Float64} + μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) + @test ŷ == μ̂ + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ end @testset "Test SampleSplitMLConditionalDistributionEstimator: Binary outcome" begin + # Check predict / expected_value / compute_offset nfolds = 3 - train_validation_indices = Tuple(MLJBase.train_test_pairs(StratifiedCV(nfolds=nfolds), 1:n, dataset, dataset.Y)) + train_validation_indices = Tuple(MLJBase.train_test_pairs(StratifiedCV(nfolds=nfolds), 1:n, binary_dataset, binary_dataset.Y)) model = LinearBinaryClassifier() estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( model, train_validation_indices ) cache = Dict() - conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, dataset;cache=cache, verbosity=verbosity) + conditional_density_estimate = @test_logs (:info, fit_log) estimator(estimand, binary_dataset;cache=cache, verbosity=verbosity) @test conditional_density_estimate isa TMLE.SampleSplitMLConditionalDistribution expected_features = collect(estimand.parents) @test all(fitted_params(mach).features == expected_features for mach in conditional_density_estimate.machines) - ŷ = predict(conditional_density_estimate, dataset) + ŷ = predict(conditional_density_estimate, binary_dataset) @test ŷ isa UnivariateFiniteVector - μ̂ = TMLE.expected_value(conditional_density_estimate, dataset) for foldid in 1:nfolds train, val = train_validation_indices[foldid] # The predictions on validation samples are 
made from # the machine trained on the train sample - ŷfold = predict(conditional_density_estimate.machines[foldid], dataset[val, expected_features]) + ŷfold = predict(conditional_density_estimate.machines[foldid], binary_dataset[val, expected_features]) @test [ŷᵢ.prob_given_ref for ŷᵢ ∈ ŷ[val]] == [ŷᵢ.prob_given_ref for ŷᵢ ∈ ŷfold] - @test μ̂[val] == [ŷᵢ.prob_given_ref[2] for ŷᵢ ∈ ŷfold] end - @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, dataset)) - # Uses the cache instead of fitting + μ̂ = TMLE.expected_value(conditional_density_estimate, binary_dataset) + @test [ŷᵢ.prob_given_ref[2] for ŷᵢ ∈ ŷ] == μ̂ + offset = TMLE.compute_offset(conditional_density_estimate, binary_dataset) + @test offset == logit.(μ̂) + @test all(0. <= x <= 1. for x in TMLE.likelihood(conditional_density_estimate, binary_dataset)) + # Check cache management + ## Uses the cache instead of fitting new_estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( LinearBinaryClassifier(), train_validation_indices ) @test TMLE.key(new_estimator) == TMLE.key(estimator) - @test_logs (:info, reuse_log) estimator(estimand, dataset;cache=cache, verbosity=verbosity) - # Changing the model leads to refit + @test_logs (:info, reuse_log) estimator(estimand, binary_dataset;cache=cache, verbosity=verbosity) + ## Changing the model leads to refit new_model = LinearBinaryClassifier(fit_intercept=false) new_estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( new_model, train_validation_indices ) - @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) - # Changing the train/validation splits leads to refit - train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=4), 1:n, dataset)) + @test_logs (:info, fit_log) new_estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) + ## Changing the train/validation splits leads to refit + train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=4), 1:n, binary_dataset)) new_estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( new_model, train_validation_indices ) - @test_logs (:info, fit_log) new_estimator(estimand, dataset; cache=cache, verbosity=verbosity) + @test_logs (:info, fit_log) new_estimator(estimand, binary_dataset; cache=cache, verbosity=verbosity) end @testset "Test SampleSplitMLConditionalDistributionEstimator: Continuous outcome" begin + # Check predict / expected_value / compute_offset nfolds = 3 train_validation_indices = Tuple(MLJBase.train_test_pairs(CV(nfolds=nfolds), 1:n, continuous_dataset)) - # Probabilistic Model + ## Probabilistic Model model = MLJGLMInterface.LinearRegressor() estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( model, @@ -108,7 +142,11 @@ end @test ŷ isa Vector{Distributions.Normal{Float64}} μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) @test μ̂ isa Vector{Float64} - # Deterministic Model + @test μ̂ == [ŷᵢ.μ for ŷᵢ in ŷ] + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test offset == μ̂ + + ## Deterministic Model model = MLJLinearModels.LinearRegressor() estimator = TMLE.SampleSplitMLConditionalDistributionEstimator( model, @@ -118,50 +156,10 @@ end ŷ = predict(conditional_density_estimate, continuous_dataset) @test ŷ isa Vector{Float64} μ̂ = TMLE.expected_value(conditional_density_estimate, continuous_dataset) - @test μ̂ isa Vector{Float64} + @test μ̂ == ŷ + offset = TMLE.compute_offset(conditional_density_estimate, continuous_dataset) + @test 
offset == μ̂ end - -@testset "Test compute_offset MLConditionalDistributionEstimator" begin - dataset = ( - T = categorical(["a", "b", "c", "a", "a", "b", "a"]), - Ycont = [1., 2., 3, 4, 5, 6, 7], - Ycat = categorical([1, 0, 0, 1, 1, 1, 0]), - W = rand(7), - ) - μYcont = mean(dataset.Ycont) - μYcat = mean(float(dataset.Ycat)) - # The model is probabilistic continuous, the offset is the mean of - # the conditional distribution - conditional_density_estimate = TMLE.MLConditionalDistributionEstimator(ConstantRegressor())( - TMLE.ConditionalDistribution(:Ycont, [:W, :T]), - dataset, - verbosity=0 - ) - offset = TMLE.compute_offset(conditional_density_estimate, dataset) - @test offset == mean.(predict(conditional_density_estimate, dataset)) - @test offset == repeat([μYcont], 7) - # The model is deterministic, the offset is simply the output - # of the predict function which is assumed to correspond to the mean - # if the squared loss was optimized for by the underlying model - conditional_density_estimate = TMLE.MLConditionalDistributionEstimator(DeterministicConstantRegressor())( - TMLE.ConditionalDistribution(:Ycont, [:W, :T]), - dataset, - verbosity=0 - ) - offset = TMLE.compute_offset(conditional_density_estimate, dataset) - @test offset == predict(conditional_density_estimate, dataset) - @test offset == repeat([μYcont], 7) - # The model is probabilistic binary, the offset is the logit - # of the mean of the conditional distribution - conditional_density_estimate = TMLE.MLConditionalDistributionEstimator(ConstantClassifier())( - TMLE.ConditionalDistribution(:Ycat, [:W, :T]), - dataset, - verbosity=0 - ) - offset = TMLE.compute_offset(conditional_density_estimate, dataset) - @test offset == repeat([logit(μYcat)], 7) -end - end true \ No newline at end of file From 6ae8d8ae98b175d8da0c38c7058de998d3a1cb80 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Tue, 16 Apr 2024 21:06:17 +0200 Subject: [PATCH 4/9] add stratifiedCV to double robustness tests --- test/counterfactual_mean_based/3points_interactions.jl | 2 +- test/counterfactual_mean_based/double_robustness_ate.jl | 4 ++-- test/counterfactual_mean_based/double_robustness_iate.jl | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/counterfactual_mean_based/3points_interactions.jl b/test/counterfactual_mean_based/3points_interactions.jl index 436aaa6d..9f032841 100644 --- a/test/counterfactual_mean_based/3points_interactions.jl +++ b/test/counterfactual_mean_based/3points_interactions.jl @@ -37,7 +37,7 @@ end treatment_confounders = (T₁=[:W], T₂=[:W], T₃=[:W]) ) models = ( - Y = TreatmentTransformer() |> InteractionTransformer(order=3) |> LinearRegressor(), + Y = with_encoder(InteractionTransformer(order=3) |> LinearRegressor()), T₁ = LogisticClassifier(lambda=0), T₂ = LogisticClassifier(lambda=0), T₃ = LogisticClassifier(lambda=0) diff --git a/test/counterfactual_mean_based/double_robustness_ate.jl b/test/counterfactual_mean_based/double_robustness_ate.jl index f87dbc4b..acc73e4d 100644 --- a/test/counterfactual_mean_based/double_robustness_ate.jl +++ b/test/counterfactual_mean_based/double_robustness_ate.jl @@ -153,7 +153,7 @@ end Y = with_encoder(ConstantClassifier()), T = with_encoder(LogisticClassifier(lambda=0)) ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-6) 
test_mean_inf_curve_almost_zero(results.ose; atol=1e-6) @@ -166,7 +166,7 @@ end Y = with_encoder(LogisticClassifier(lambda=0)), T = with_encoder(ConstantClassifier()) ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-6) end diff --git a/test/counterfactual_mean_based/double_robustness_iate.jl b/test/counterfactual_mean_based/double_robustness_iate.jl index 349471e5..9afabba5 100644 --- a/test/counterfactual_mean_based/double_robustness_iate.jl +++ b/test/counterfactual_mean_based/double_robustness_iate.jl @@ -182,7 +182,7 @@ end T₁ = with_encoder(LogisticClassifier(lambda=0)), T₂ = with_encoder(LogisticClassifier(lambda=0)), ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-9) test_mean_inf_curve_almost_zero(results.ose; atol=1e-9) @@ -197,7 +197,7 @@ end T₁ = with_encoder(ConstantClassifier()), T₂ = with_encoder(ConstantClassifier()), ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-9) test_mean_inf_curve_almost_zero(results.ose; atol=1e-9) @@ -267,7 +267,7 @@ end T₁ = with_encoder(LogisticClassifier(lambda=0)), T₂ = with_encoder(LogisticClassifier(lambda=0)) ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-5) test_mean_inf_curve_almost_zero(results.ose; atol=1e-10) @@ -283,7 +283,7 @@ end T₁ = with_encoder(ConstantClassifier()), T₂ = with_encoder(ConstantClassifier()), ) - dr_estimators = double_robust_estimators(models) + dr_estimators = double_robust_estimators(models, resampling=StratifiedCV()) results, cache = test_coverage_and_get_results(dr_estimators, Ψ, Ψ₀, dataset; verbosity=0) test_mean_inf_curve_almost_zero(results.tmle; atol=1e-5) test_mean_inf_curve_almost_zero(results.ose; atol=1e-10) From 7db4b1159d308bc741260ff458c9facdfdcb307e Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Sun, 21 Apr 2024 21:16:09 +0100 Subject: [PATCH 5/9] add estimators cheatsheet --- docs/src/estimators_cheatsheet.md | 264 ++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 docs/src/estimators_cheatsheet.md diff --git a/docs/src/estimators_cheatsheet.md b/docs/src/estimators_cheatsheet.md new file mode 100644 index 00000000..044cae12 --- /dev/null +++ b/docs/src/estimators_cheatsheet.md @@ -0,0 +1,264 @@ +# Estimators' Cheatsheet + +This section is an effort to succinctly summarize the definition of semi-parametric estimators available in this package. As such, it is not self-contained, rather, it is intended as a mathematical memo that can be quickly searched. Gradients, One-Step and Targeted Maximum-Likelihood estimators are provided for the Counterfactual Mean, Average Treatment Effect and Average Interaction Effect. 
Estimators are presented in both their canonical and cross-validated versions.
+
+One major difficulty I personally faced when entering the field was the overwhelming notational burden. Unfortunately, this burden is necessary to understand how the various mathematical objects are handled by the procedures presented below. It is thus worth the effort to make sure you understand what each notation means. The reward? After reading this document, you should be able to implement any estimator present on this page.
+
+Finally, if you find inconsistencies or imprecisions, please report them, so we can keep improving!
+
+!!! note
+    This page is still under construction and more content will be added in the coming months.
+
+## Where it all begins
+
+### Notations
+
+This is the notation we use throughout:
+
+- The observed data: We assume we observe the realization of a random vector ``\bold{Z}_n = (Z_1, ..., Z_n)``. The components of ``\bold{Z}`` are assumed independent and identically distributed according to ``\mathbb{P}``, i.e. ``\forall i \in \{1, ..., n\}, Z_i \sim \mathbb{P}``. Note that each ``Z_i`` is usually a vector as well, for us: ``(W_i, T_i, Y_i)``.
+- The Empirical Distribution: ``\mathbb{P}_n(A) = \frac{1}{n}\sum_{i=1}^n 1_A(Z_i)``. For each set ``A``, it computes the mean number of points falling into ``A``.
+- Expected value of a function ``f`` with respect to a distribution ``\mathbb{P}``: ``\mathbb{E}_{\mathbb{P}} f = \mathbb{P}f``. Note that for the empirical distribution ``\mathbb{P}_n``, this is simply: ``\mathbb{P}_nf = \frac{1}{n}\sum_{i=1}^nf(Z_i)``, in other words, the sample mean.
+- The Estimand: The unknown finite-dimensional quantity we are interested in. ``\Psi`` is defined as a functional, i.e. ``\mathbb{P} \mapsto \Psi(\mathbb{P}) \in \mathbb{R}^d``. In this document ``d=1``.
+- The Gradient of ``\Psi`` at the distribution ``\mathbb{P}``: ``\phi_{\mathbb{P}}``, a function of the data, i.e. ``Z \mapsto \phi_{\mathbb{P}}(Z)``. The gradient satisfies: ``\mathbb{P}\phi_{\mathbb{P}} = 0`` and ``Var[\phi_{\mathbb{P}}] < \infty``.
+- An estimator is a function of the data ``\bold{Z}_n``, and thus a random variable, that seeks to approximate an unknown quantity. For instance, ``\hat{\Psi}_n`` denotes an estimator for ``\Psi``. Notice that the empirical distribution is an estimator for ``\mathbb{P}``, but the hat is omitted to distinguish it from other estimators that will be defined later on.
+
+### The Counterfactual Mean
+
+Let ``\bold{Z}=(W, T, Y)_{i=1..n}`` be a dataset generated according to the following structural causal model:
+
+```math
+\begin{aligned}
+W &= f_W(U_W) \\
+T &= f_T(W, U_T) \\
+Y &= f_Y(T, W, U_Y)
+\end{aligned}
+```
+
+This is certainly the most common scenario in causal inference where ``Y`` is an outcome of interest, ``T`` a set of treatment variables and ``W`` a set of confounding variables. We are generally interested in the effect of ``T`` on ``Y``. In this document, we will consider the counterfactual mean as a workhorse. We will show that the estimators for the Average Treatment Effect and Average Interaction Effect can easily be derived from it.
Under the usual identification conditions, the counterfactual mean is identified by the following statistical estimand:
+
+```math
+CM_t(\mathbb{P}) = \Psi_t(\mathbb{P}) = \mathbb{E}_{\mathbb{P}}[\mathbb{E}_{\mathbb{P}}[Y | W, T = t]] = \int \mathbb{E}_{\mathbb{P}}[Y | W, T = t] d\mathbb{P}(w)
+```
+
+From this definition, we can see that ``\Psi_t`` depends on ``\mathbb{P}`` only through two relevant factors:
+
+```math
+\begin{aligned}
+Q_Y(W, T) &= \mathbb{E}_{\mathbb{P}}[Y | W, T] \\
+Q_W(W) &= \mathbb{P}(W)
+\end{aligned}
+```
+
+So that ``\Psi(\mathbb{P}) = \Psi(Q_Y, Q_W)``, which makes explicit that we don't need to estimate ``\mathbb{P}`` but only the relevant factors ``(Q_Y, Q_W)`` to obtain a plugin estimate of ``\Psi(\mathbb{P})``. Finally, it will be useful to define an additional factor, the propensity score:
+
+```math
+g(W, T) = \mathbb{P}(T | W)
+```
+
+This is because the gradient of ``\Psi``:
+
+```math
+\phi_{CM_{t}, \mathbb{P}}(W, T, Y) = \frac{\mathbb{1}(T = t)}{g(W, t)}(Y − Q_Y(W, t)) + Q_Y(W, t) − \Psi(\mathbb{P})
+```
+
+which is the foundation of semi-parametric estimation, depends on this so-called nuisance parameter.
+
+### Average Treatment Effect and Average Interaction Effect?
+
+They are simple linear combinations of counterfactual means, and so are their gradients.
+
+#### Average Treatment Effect
+
+For two values of ``T: t_{control} \rightarrow t_{case}``, the Average Treatment Effect (ATE) is defined by:
+
+```math
+ATE_{t_{case}, t_{control}}(\mathbb{P}) = CM_{t_{case}}(\mathbb{P}) - CM_{t_{control}}(\mathbb{P})
+```
+
+And its associated gradient is:
+
+```math
+\begin{aligned}
+\phi_{ATE}(W, T, Y) &= \phi_{CM_{t_{case}}}(W, T, Y) - \phi_{CM_{t_{control}}}(W, T, Y) \\
+&= H(W, T)(Y − Q_Y(W, T)) + C(W) − ATE_{t_{case}, t_{control}}(\mathbb{P})
+\end{aligned}
+```
+
+with:
+
+```math
+\begin{equation}
+  \begin{cases}
+    H(W, T) &= \frac{(-\mathbb{1}(T \in (t_{case}, t_{control})))^{T=t_{control}}}{g(W, T)} \\
+    C(W) &= (Q_Y(W, t_{case}) - Q_Y(W, t_{control}))
+  \end{cases}
+\end{equation}
+```
+
+``H`` is also known as the clever covariate in the Targeted Learning literature (see later).
+
+#### Average Interaction Effect
+
+For simplicity, we only consider two treatments ``T_1: t_{1,control} \rightarrow t_{1,case}`` and ``T_2: t_{2,control} \rightarrow t_{2,case}``.
+
+```math
+AIE(\mathbb{P}) = CM_{t_{1, case}, t_{2, case}}(\mathbb{P}) - CM_{t_{1, case}, t_{2, control}}(\mathbb{P}) - CM_{t_{1, control}, t_{2, case}}(\mathbb{P}) + CM_{t_{1, control}, t_{2, control}}(\mathbb{P})
+```
+
+The gradient is similarly given by:
+
+```math
+\begin{aligned}
+\phi_{AIE}(W, T, Y) &= \phi_{CM_{t_{1,case}, t_{2,case}}}(W, T, Y) - \phi_{CM_{t_{1,case}, t_{2,control}}}(W, T, Y) - \phi_{CM_{t_{1,control}, t_{2,case}}}(W, T, Y) + \phi_{CM_{t_{1,control}, t_{2,control}}}(W, T, Y) \\
+&= H(W, T)(Y − Q_Y(W, T)) + C(W) − AIE(\mathbb{P})
+\end{aligned}
+```
+
+with, this time, a clever covariate that multiplies the two treatment-specific signed indicators and a double difference of conditional means:
+
+```math
+\begin{equation}
+  \begin{cases}
+    H(W, T) &= \frac{(-\mathbb{1}(T_1 \in (t_{1,case}, t_{1,control})))^{T_1=t_{1,control}} \cdot (-\mathbb{1}(T_2 \in (t_{2,case}, t_{2,control})))^{T_2=t_{2,control}}}{g(W, T)} \\
+    C(W) &= Q_Y(W, t_{1,case}, t_{2,case}) - Q_Y(W, t_{1,case}, t_{2,control}) - Q_Y(W, t_{1,control}, t_{2,case}) + Q_Y(W, t_{1,control}, t_{2,control})
+  \end{cases}
+\end{equation}
+```
+
+## Motivation for the Estimators
+
+The theory is based on the following von Mises expansion (the functional equivalent of a Taylor expansion), which simplifies because the gradient satisfies ``\mathbb{E}_{\mathbb{P}}[\phi_{\mathbb{P}}(Z)] = 0``:
+ +```math +\begin{aligned} +\Psi(\hat{\mathbb{P}}) − \Psi(\mathbb{P}) &= \int \phi_{\hat{\mathbb{P}}}(z) \mathrm{d}(\hat{\mathbb{P}} − \mathbb{P})(z) + R_2(\hat{\mathbb{P}}, \mathbb{P}) \\ +&= - \int \phi_{\hat{\mathbb{P}}}(z) \mathrm{d}\mathbb{P}(z) + R_2(\hat{\mathbb{P}}, \mathbb{P}) +\end{aligned} +``` + +This suggests that a plugin estimator, one that simply evaluates ``\Psi`` at an estimator ``\hat{\mathbb{P}}`` of ``\mathbb{P}``, is biased. This bias can be elegantly decomposed in four terms by reworking the previous expression: + +```math +\Psi(\hat{\mathbb{P}}) − \Psi(\mathbb{P}) = \mathbb{P}_n\phi_{\mathbb{P}}(Z) - \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z) + (\mathbb{P}_n − \mathbb{P})(ϕ_{\mathbb{\hat{P}}}(Z) − \phi_{\mathbb{P}}(Z)) + R_2(\mathbb{\hat{P}}, \mathbb{P}) +``` + +- The asymptotically linear term: + +```math +\mathbb{P}_n\phi_{\mathbb{P}}(Z) +``` + +By the central limit theorem, it is asymptotically normal with variance ``Var[\phi]/n``, it is used to build confidence interval. + +- The bias term: + +```math +- \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z) +``` + +This is the term both the One-Step estimator and the Targeted Maximum-Likelihood estimator deal with. + +- The empirical process term: + +```math +(\mathbb{P}_n − \mathbb{P})(ϕ_{\mathbb{\hat{P}}}(Z) − \phi_{\mathbb{P}}(Z)) +``` + +This is usually negligible under minimal assumptions. + +- The second-order remainder term: + +```math +R_2(\mathbb{\hat{P}}, \mathbb{P}) +``` + +This is often negligible, but see the cross-validated estimators section. + +## Inference + +According to the previous section, the OSE and TMLE will be asymptotically linear with efficient influence curve the gradient ``\phi``. This means the central limit theorem applies and the variance of the estimators can be estimated by: + +```math +\begin{aligned} +\hat{Var}(\hat{\Psi}_n) &= \frac{\hat{Var}(\hat{\phi}_n)}{n} \\ +&= \frac{1}{n(n-1)}\sum_{i=1}^n \hat{\phi}(W_i, T_i, Y_i)^2 +\end{aligned} +``` + +because the gradient has mean 0. + +## One-Step Estimator + +### Canonical OSE + +The One-Step estimator is very intuitive, it simply corrects the initial plugin estimator by adding in the residual bias term. As such, it corrects for the bias in the estimand's space: + +```math +\hat{\Psi}_{n, OSE} = \Psi(\hat{P}) + P_n{\phi_{\hat{P}}(Z)} +``` + +### CV-OSE + +Assume the realization of ``n`` random variables ``K_i`` assigning each sample to one of ``{1, ..., K}`` folds. For a given sample ``i``, we denote by ``k(i)`` the validation fold it belongs to and ``-k(i)`` the remaining training fold. Similarly, we denote by ``\hat{Q}^{k}`` an estimator for ``Q`` obtained from samples in the validation fold ``k`` and ``\hat{Q}^{-k}`` an estimator for ``Q`` obtained from samples in the (training) fold ``\{1, ..., K\}-\{k\}``. + +The cross-validated One-Step estimator can be compactly written as an average over the folds of sub one-step estimators: + +```math +\begin{aligned} +\hat{\Psi}_{n, CV-OSE} &= \sum_{k=1}^K \frac{N_k}{n} (\Psi(\hat{Q}_Y^{-k}, \hat{Q}_W^{k}) + \hat{\mathbb{P}}_n^k \phi^{-k}) \\ +&= \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} (\hat{Q}_Y^{-k}(W_i, T_i) + \phi^{-k}(W_i, T_i, Y_i)) +\end{aligned} +``` + +The important thing to note is that for each sub one-step estimator, the sum runs over the validation samples while ``\hat{Q}_Y^{-k}`` and ``\hat{\phi}^{-k}`` are estimated using the training samples. 
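As a rough illustration, assuming the fitted ``\hat{Q}_Y`` and ``\hat{g}`` are available as plain Julia functions (this is a sketch, not TMLE.jl code), the canonical and cross-validated one-step estimators of the counterfactual mean could look like:

```julia
# Sketch only: one-step estimation of CM(t) from generic fitted functions
# Q̂(w, t) ≈ E[Y | W=w, T=t] and ĝ(w, t) ≈ P(T=t | W=w), and data vectors W, T, Y.
using Statistics

function one_step_cm(Q̂, ĝ, W, T, Y, t)
    plugin = mean(Q̂.(W, t))                                        # Ψ(P̂)
    ϕ̂ = @. (T == t) / ĝ(W, t) * (Y - Q̂(W, t)) + Q̂(W, t) - plugin    # gradient at P̂
    plugin + mean(ϕ̂)                                                # Ψ(P̂) + P̂ₙϕ̂
end

# Cross-validated version: Q̂s[k] and ĝs[k] are fitted on the k-th training fold,
# folds[k] contains the validation indices of fold k.
function cv_one_step_cm(Q̂s, ĝs, folds, W, T, Y, t)
    n = length(Y)
    sum(length(idx) / n * one_step_cm(Q̂s[k], ĝs[k], W[idx], T[idx], Y[idx], t)
        for (k, idx) in enumerate(folds))
end
```

The key point of the cross-validated version is visible in the signature: the predictions for fold ``k`` come from functions that were fitted without that fold.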
+ +## Targeted Maximum-Likelihood Estimation + +Unlike the One-Step estimator, the Targeted Maximum-Likelihood Estimator corrects the bias term in distribution space. That is, it moves the initial estimate ``\hat{\mathbb{P}}^0=\hat{\mathbb{P}}`` to a corrected ``\hat{\mathbb{P}}^*`` (notice the new superscript notation). Then the plugin principle can be applied and the targeted estimator is simply ``\hat{\Psi}_{n, TMLE} = \Psi(\hat{\mathbb{P}}^*)``. This means TMLE always respects the natural range of the estimand, giving it an upper hand on the One-Step estimator. + +### Canonical TMLE + +The way ``\hat{\mathbb{P}}`` is modified is by means of a parametric sub-model also known as a fluctuation. It can be shown that for the conditional mean, it is sufficient to fluctuate ``\hat{Q}_{n, Y}``. The fluctuations that are used in practice are: + +- ``\hat{Q}_{Y, \epsilon}(W, T) = \hat{Q}_{n, Y}(T, W) + \epsilon \hat{H}(T, W)``, for continuous outcomes ``Y``. +- ``\hat{Q}_{Y, \epsilon}(W, T) = expit(logit(\hat{Q}_{n, Y}(T, W)) + \epsilon \hat{H}(T, W))``, for binary outcomes ``Y``. + +where ``\hat{H}(T, W) = \frac{1(T=t)}{\hat{g}_n(W)}`` is known as the clever covariate. The value of ``\epsilon`` is obtained by minimizing the loss ``L`` associated with ``Q_Y``, that is the mean-squared error for continuous outcomes and negative log-likelihood for binary outcomes. This can easily be done via linear and logistic regression respectively. + +!!! note + Just like the gradient is linear in ``\Psi``, the clever covariate used to fluctuate the initial ``\hat{Q}_Y`` is as presented in [Average Treatment Effect and Average Interaction Effect?](@ref) + +### CV-TMLE + +Using the same notation as the cross-validated One-Step estimator, the fluctuated distribution is obtained by solving: + +```math +\epsilon^* = \underset{\epsilon}{\arg \min} \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} L(Y_i, \hat{Q}_{Y, \epsilon}^{-k}(W_i, T_i, Y_i)) +``` + +where ``\hat{Q}_{Y, \epsilon}`` and ``L`` are the respective fluctuations and loss for continuous and binary outcomes. This leads to a targeted ``\hat{Q}_Y^{*}`` such that: + +```math +\forall i, \hat{Q}_Y^{*}(W_i, T_i) = \hat{Q}_{Y, \epsilon^*}^{-k(i)}(W_i, T_i) +``` + +That is, the predictions of ``\hat{Q}_Y^{*}`` for sample ``i`` are based on the + +Then, the CV-TMLE is: + +```math +\begin{aligned} +\hat{\Psi}_{n, CV-TMLE} &= \sum_{k=1}^K \frac{N_k}{n} \Psi(\hat{Q}_Y^{*}, \hat{Q}_W^{k}) \\ +&= \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} \hat{Q}_Y^{*}(W_i) +\end{aligned} +``` + +Notice that while ``\hat{\Psi}_{n, CV-TMLE}`` is not a plugin estimator anymore, it still respects the natural range of the parameter because it is an average of plugin estimators. Also, because ``\hat{Q}_Y^{*,-k}`` is based both on training and validation samples, the elements of the sum are not truly independent. + +## Acknowledgements + +The content of this page was largely inspired from: + +- [Semiparametric doubly robust targeted double machine learning: a review](https://arxiv.org/pdf/2203.06469.pdf). +- [Introduction to Modern Causal Inference](https://alejandroschuler.github.io/mci/). +- [Targeted Learning, Causal Inference for Observational and Experimental Data](https://link.springer.com/book/10.1007/978-1-4419-9782-1). 
From 4a1d0c40a1a04ff7afcac78f3c8627086373a851 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Mon, 22 Apr 2024 14:39:48 +0100 Subject: [PATCH 6/9] add some more docs --- docs/make.jl | 1 + docs/src/estimators_cheatsheet.md | 118 +++++++++++++++++++----------- docs/src/index.md | 9 ++- docs/src/resources.md | 8 +- docs/src/user_guide/estimation.md | 101 +++++++++++++------------ 5 files changed, 142 insertions(+), 95 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 1b5acea6..69c5463e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -34,6 +34,7 @@ makedocs(; joinpath("examples", "super_learning.md"), joinpath("examples", "double_robustness.md") ], + "Estimators' Cheat Sheet" => "estimators_cheatsheet.md", "Resources" => "resources.md", "API Reference" => "api.md" ], diff --git a/docs/src/estimators_cheatsheet.md b/docs/src/estimators_cheatsheet.md index 044cae12..b439cdb8 100644 --- a/docs/src/estimators_cheatsheet.md +++ b/docs/src/estimators_cheatsheet.md @@ -6,9 +6,6 @@ One major difficulty I personally faced when entering the field, was the overwhe Finally, if you find inconsistencies or imprecision, please report it, so we can keep improving! -!!! note - This page is still under construction and more content will be added in the coming months. - ## Where it all begins ### Notations @@ -49,7 +46,7 @@ Q_W(W) &= \mathbb{P}(W) \end{aligned} ``` -So that ``\Psi(\mathbb{P}) = \Psi(Q_Y, Q_W)``, which makes explicit that we don't need to estimate ``\mathbb{P}`` but only the relevant factors ``(Q_Y, Q_W)`` to obtain a plugin estimate of ``\Psi(\mathbb{P})``. Finally, it will be useful to define an additional factor: +So that ``\Psi(\mathbb{P}) = \Psi(Q_Y, Q_W)`` (the ``t`` subscript is dropped as it is unambiguous). This makes explicit that we don't need to estimate ``\mathbb{P}`` but only the relevant factors ``(Q_Y, Q_W)`` to obtain a plugin estimate of ``\Psi(\mathbb{P})``. Finally, it will be useful to define an additional factor: ```math g(W, T) = \mathbb{P}(T | W) @@ -69,7 +66,7 @@ They are simple linear combinations of counterfactual means and so is their grad #### Average Treatment Effect -For two values of ``T: t_{control} \rightarrow t_{case}``, the Average Treatment Effect (ATE) is defined by: +In all generality, for two values of a categorical treatment variable ``T: t_{control} \rightarrow t_{case}``, the Average Treatment Effect (ATE) is defined by: ```math ATE_{t_{case}, t_{control}}(\mathbb{P}) = CM_{t_{case}}(\mathbb{P}) - CM_{t_{control}}(\mathbb{P}) @@ -87,25 +84,25 @@ And it's associated gradient is: with: ```math -\begin{equation} +\begin{aligned} \begin{cases} H(W, T) &= \frac{(-\mathbb{1}(T \in (t_{case}, t_{control})))^{T=t_{control}}}{g(W, T)} \\ C(W) &= (Q_Y(W, t_{case}) - Q_Y(W, t_{control})) \end{cases} -\end{equation} +\end{aligned} ``` ``H`` is also known as the clever covariate in the Targeted Learning literature (see later). #### Average Interaction Effect -For simplicity we only consider two treatments ``T_1: t_{1,control} \rightarrow t_{1,case}`` and ``T_2: t_{2,control} \rightarrow t_{2,case}``. +For simplicity, we only consider two treatments ``T = (T_1, T_2)`` such that: ``T_1: t_{1,control} \rightarrow t_{1,case}`` and ``T_2: t_{2,control} \rightarrow t_{2,case}``. 
The Average Interaction Effect (AIE) of ``(T_1, T_2)`` is defined as:

```math
AIE(\mathbb{P}) = CM_{t_{1, case}, t_{2, case}}(\mathbb{P}) - CM_{t_{1, case}, t_{2, control}}(\mathbb{P}) - CM_{t_{1, control}, t_{2, case}}(\mathbb{P}) + CM_{t_{1, control}, t_{2, control}}(\mathbb{P})
```

And its gradient is given by:

```math
\begin{aligned}
\phi_{AIE}(W, T, Y) &= \phi_{CM_{t_{1,case}, t_{2,case}}}(W, T, Y) - \phi_{CM_{t_{1,case}, t_{2,control}}}(W, T, Y) - \phi_{CM_{t_{1,control}, t_{2,case}}}(W, T, Y) + \phi_{CM_{t_{1,control}, t_{2,control}}}(W, T, Y) \\
&= H(W, T)(Y − Q_Y(W, T)) + C(W) − AIE(\mathbb{P})
\end{aligned}
```

with:

```math
\begin{aligned}
    \begin{cases}
    H(W, T) &= \frac{(-\mathbb{1}(T_1 \in (t_{1, case}, t_{1, control})))^{T_1=t_{1, control}}(-\mathbb{1}(T_2 \in (t_{2, case}, t_{2, control})))^{T_2=t_{2, control}}}{g(W, T)} \\
    C(W) &= (Q_Y(W, t_{1, case}, t_{2, case}) - Q_Y(W, t_{1, case}, t_{2, control}) - Q_Y(W, t_{1, control}, t_{2, case}) + Q_Y(W, t_{1, control}, t_{2, control}))
    \end{cases}
\end{aligned}
```

For higher-order interactions the two factors ``H`` and ``C`` can be similarly inferred.

## Asymptotic Analysis of Plugin Estimators

The theory is based on the von Mises expansion (functional equivalent of Taylor's expansion) which reduces due to the fact that the gradient satisfies: ``\mathbb{E}_{\mathbb{P}}[\phi_{\mathbb{P}}(Z)] = 0``.

```math
\begin{aligned}
\Psi(\hat{\mathbb{P}}) − \Psi(\mathbb{P}) &= \int \phi_{\hat{\mathbb{P}}}(z) \mathrm{d}(\hat{\mathbb{P}} − \mathbb{P})(z) + R_2(\hat{\mathbb{P}}, \mathbb{P}) \\
&= - \int \phi_{\hat{\mathbb{P}}}(z) \mathrm{d}\mathbb{P}(z) + R_2(\hat{\mathbb{P}}, \mathbb{P})
\end{aligned}
```

This suggests that a plugin estimator, one that simply evaluates ``\Psi`` at an estimator ``\hat{\mathbb{P}}`` of ``\mathbb{P}``, is biased. This bias can be elegantly decomposed into four terms by reworking the previous expression:

```math
\Psi(\hat{\mathbb{P}}) − \Psi(\mathbb{P}) = \mathbb{P}_n\phi_{\mathbb{P}}(Z) - \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z) + (\mathbb{P}_n − \mathbb{P})(ϕ_{\mathbb{\hat{P}}}(Z) − \phi_{\mathbb{P}}(Z)) + R_2(\mathbb{\hat{P}}, \mathbb{P})
```

### 1. The Asymptotically Linear Term

```math
\mathbb{P}_n\phi_{\mathbb{P}}(Z)
```

By the central limit theorem, it is asymptotically normal with variance ``Var[\phi]/n``, and it is used to build confidence intervals.

### 2. The Bias Term

```math
- \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z)
```

This is the term both the One-Step estimator and the Targeted Maximum-Likelihood estimator deal with.

### 3. The Empirical Process Term

```math
(\mathbb{P}_n − \mathbb{P})(ϕ_{\mathbb{\hat{P}}}(Z) − \phi_{\mathbb{P}}(Z))
```

This can be shown to be of order ``o_{\mathbb{P}}(\frac{1}{\sqrt{n}})`` if ``\phi_{\hat{\mathbb{P}}}`` converges to ``\phi_{\mathbb{P}}`` in ``L_2(\mathbb{P})`` norm, that is:

```math
\int (\phi_{\hat{\mathbb{P}}}(Z) - \phi_{\mathbb{P}}(Z) )^2 d\mathbb{P}(Z) = o_{\mathbb{P}}(\frac{1}{\sqrt{n}})
```

and any of the following holds:

- ``\phi`` (or equivalently its components) is [Donsker](https://en.wikipedia.org/wiki/Donsker_classes), i.e., not too complex.
- The estimator is constructed using sample-splitting (see cross-validated estimators).

### 4. 
The Second-Order Remainder Term ```math R_2(\mathbb{\hat{P}}, \mathbb{P}) ``` -This is often negligible, but see the cross-validated estimators section. +This term is usually more complex to analyse. However, for the counterfactual mean, it can be shown that if ``g(W,T) \geq \frac{1}{\eta}`` (positivity constraint): + +```math +|R_2(\mathbb{\hat{P}}, \mathbb{P})| \leq \eta ||\hat{Q}_{n, Y} - Q_{Y}|| \cdot ||\hat{g}_{n} - g|| +``` + +and thus, if the estimators ``\hat{Q}_{n, Y}`` and ``\hat{g}_{n}`` converge at a rate ``o_{\mathbb{P}}(n^{-\frac{1}{4}})``, the second-order remainder will be ``o_{\mathbb{P}}(\frac{1}{\sqrt{n}})``. This is the case for many popular estimators like random forests, neural networks, etc... -## Inference +## Asymptotic Linearity and Inference -According to the previous section, the OSE and TMLE will be asymptotically linear with efficient influence curve the gradient ``\phi``. This means the central limit theorem applies and the variance of the estimators can be estimated by: +According to the previous section, the OSE and TMLE will be asymptotically linear with efficient influence curve the gradient ``\phi``: ```math -\begin{aligned} -\hat{Var}(\hat{\Psi}_n) &= \frac{\hat{Var}(\hat{\phi}_n)}{n} \\ -&= \frac{1}{n(n-1)}\sum_{i=1}^n \hat{\phi}(W_i, T_i, Y_i)^2 -\end{aligned} +\sqrt{n}(\hat{\Psi} - \Psi) = \frac{1}{\sqrt{n}} \sum_{i=1}^n \phi(Z_i) + o_{\mathbb{P}}(\frac{1}{\sqrt{n}}) ``` -because the gradient has mean 0. +By the [Central Limit Theorem](https://en.wikipedia.org/wiki/Central_limit_theorem) and [Slutsky's Theorem](https://en.wikipedia.org/wiki/Slutsky%27s_theorem) we have: + +```math +\sqrt{n}(\hat{\Psi} - \Psi) \leadsto \mathcal{N}(0, Std(\phi)) +``` + +Now, if we consider ``S_n = \hat{Std}(\phi_{\hat{\mathbb{P}}})`` as a consistent estimator for ``Std(\phi)``, we have by Slutsky's Theorem again, the following pivot: + +```math +\frac{\sqrt{n}(\hat{\Psi} - \Psi)}{S_n} \leadsto \mathcal{N}(0, 1) +``` + +which can be used to build confidence intervals: + +```math +\underset{n \to \infty}{\lim} P(\hat{\Psi}_n - \frac{S_n}{\sqrt{n}}z_{\alpha} \leq \Psi \leq \hat{\Psi}_n + \frac{S_n}{\sqrt{n}}z_{\alpha}) = 1 - 2\alpha +``` ## One-Step Estimator ### Canonical OSE -The One-Step estimator is very intuitive, it simply corrects the initial plugin estimator by adding in the residual bias term. As such, it corrects for the bias in the estimand's space: +The One-Step estimator is very intuitive, it simply corrects the initial plugin estimator by adding in the residual bias term. As such, it corrects for the bias in the estimand's space. Let ``\hat{P}= (\hat{Q}_{n,Y}, \hat{Q}_{n,W})`` be an estimator of the relevant factors of ``\mathbb{P}`` as well as ``\hat{g}_n`` an estimator of the nuisance function ``g``. The OSE is: ```math -\hat{\Psi}_{n, OSE} = \Psi(\hat{P}) + P_n{\phi_{\hat{P}}(Z)} +\hat{\Psi}_{n, OSE} = \Psi(\hat{P}) + P_n{\phi_{\hat{P}}} ``` ### CV-OSE @@ -212,21 +239,27 @@ The cross-validated One-Step estimator can be compactly written as an average ov The important thing to note is that for each sub one-step estimator, the sum runs over the validation samples while ``\hat{Q}_Y^{-k}`` and ``\hat{\phi}^{-k}`` are estimated using the training samples. -## Targeted Maximum-Likelihood Estimation +## Targeted Maximum-Likelihood Estimator Unlike the One-Step estimator, the Targeted Maximum-Likelihood Estimator corrects the bias term in distribution space. 
That is, it moves the initial estimate ``\hat{\mathbb{P}}^0=\hat{\mathbb{P}}`` to a corrected ``\hat{\mathbb{P}}^*`` (notice the new superscript notation). Then the plugin principle can be applied and the targeted estimator is simply ``\hat{\Psi}_{n, TMLE} = \Psi(\hat{\mathbb{P}}^*)``. This means TMLE always respects the natural range of the estimand, giving it an upper hand on the One-Step estimator. ### Canonical TMLE -The way ``\hat{\mathbb{P}}`` is modified is by means of a parametric sub-model also known as a fluctuation. It can be shown that for the conditional mean, it is sufficient to fluctuate ``\hat{Q}_{n, Y}``. The fluctuations that are used in practice are: +The way ``\hat{\mathbb{P}}`` is modified is by means of a parametric sub-model also known as a fluctuation. It can be shown that for the conditional mean, it is sufficient to fluctuate ``\hat{Q}_{n, Y}`` only once using the following fluctuations: - ``\hat{Q}_{Y, \epsilon}(W, T) = \hat{Q}_{n, Y}(T, W) + \epsilon \hat{H}(T, W)``, for continuous outcomes ``Y``. -- ``\hat{Q}_{Y, \epsilon}(W, T) = expit(logit(\hat{Q}_{n, Y}(T, W)) + \epsilon \hat{H}(T, W))``, for binary outcomes ``Y``. +- ``\hat{Q}_{Y, \epsilon}(W, T) = \frac{1}{1 + e^{-(logit(\hat{Q}_{n, Y}(T, W)) + \epsilon \hat{H}(T, W))}}``, for binary outcomes ``Y``. where ``\hat{H}(T, W) = \frac{1(T=t)}{\hat{g}_n(W)}`` is known as the clever covariate. The value of ``\epsilon`` is obtained by minimizing the loss ``L`` associated with ``Q_Y``, that is the mean-squared error for continuous outcomes and negative log-likelihood for binary outcomes. This can easily be done via linear and logistic regression respectively. !!! note - Just like the gradient is linear in ``\Psi``, the clever covariate used to fluctuate the initial ``\hat{Q}_Y`` is as presented in [Average Treatment Effect and Average Interaction Effect?](@ref) + For the ATE and AIE, just like the gradient is linear in ``\Psi``, the clever covariate used to fluctuate the initial ``\hat{Q}_{n, Y}`` is as presented in [Average Treatment Effect and Average Interaction Effect?](@ref) + +If we denote by ``\epsilon^*`` the value of ``\epsilon`` minimizing the loss, the TMLE is: + +```math +\hat{\Psi}_{n, TMLE} = \Psi(\hat{Q}_{n, Y, \epsilon^*}, \hat{Q}_{n, W}) +``` ### CV-TMLE @@ -236,29 +269,30 @@ Using the same notation as the cross-validated One-Step estimator, the fluctuate \epsilon^* = \underset{\epsilon}{\arg \min} \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} L(Y_i, \hat{Q}_{Y, \epsilon}^{-k}(W_i, T_i, Y_i)) ``` -where ``\hat{Q}_{Y, \epsilon}`` and ``L`` are the respective fluctuations and loss for continuous and binary outcomes. This leads to a targeted ``\hat{Q}_Y^{*}`` such that: +where ``\hat{Q}_{Y, \epsilon}`` and ``L`` are the respective fluctuations and loss for continuous and binary outcomes. This leads to a targeted ``\hat{Q}_{n,Y}^{*}`` such that: ```math -\forall i, \hat{Q}_Y^{*}(W_i, T_i) = \hat{Q}_{Y, \epsilon^*}^{-k(i)}(W_i, T_i) +\forall i \in \{1, ..., n\}, \hat{Q}_{n, Y}^{*}(W_i, T_i) = \hat{Q}_{Y, \epsilon^*}^{-k(i)}(W_i, T_i) ``` -That is, the predictions of ``\hat{Q}_Y^{*}`` for sample ``i`` are based on the +That is, the predictions of ``\hat{Q}_{n, Y}^{*}`` for sample ``i`` are based on the out of fold predictions of ``\hat{Q}_{Y}^{-k(i)}`` and the "pooled" fluctuation given by ``\epsilon^*``. 
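For intuition, the fluctuation step can be sketched in a few lines for a binary outcome. This is a standalone illustration, not the package's implementation: it assumes ``\hat{Q}`` and ``\hat{g}`` are available as plain functions and relies on GLM.jl to fit ``\epsilon`` with the initial fit as offset.

```julia
# Illustration only (not TMLE.jl internals): one fluctuation step for CM(t)
# with a binary outcome Y ∈ {0, 1}. Q̂(w, t) and ĝ(w, t) are assumed fitted elsewhere.
using DataFrames, GLM, Statistics

expit(x) = 1 / (1 + exp(-x))
logit(p) = log(p / (1 - p))

function targeted_cm_binary(Q̂, ĝ, W, T, Y, t)
    H      = @. (T == t) / ĝ(W, t)        # clever covariate at the observed treatments
    offset = @. logit(Q̂(W, T))            # initial fit, used as an offset
    fluct  = glm(@formula(Y ~ 0 + H), DataFrame(Y=Y, H=H), Bernoulli(), LogitLink();
                 offset=offset)
    ϵ̂ = only(coef(fluct))
    # Targeted predictions under T = t, then the plugin estimate
    mean(@. expit(logit(Q̂(W, t)) + ϵ̂ / ĝ(W, t)))
end
```

In the cross-validated case, the offset and clever covariate for sample ``i`` are computed from the fold-specific estimators ``\hat{Q}_Y^{-k(i)}`` and ``\hat{g}^{-k(i)}``, while a single ``\epsilon^*`` is fitted on the pooled samples, as described above.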
Then, the CV-TMLE is: ```math \begin{aligned} -\hat{\Psi}_{n, CV-TMLE} &= \sum_{k=1}^K \frac{N_k}{n} \Psi(\hat{Q}_Y^{*}, \hat{Q}_W^{k}) \\ -&= \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} \hat{Q}_Y^{*}(W_i) +\hat{\Psi}_{n, CV-TMLE} &= \sum_{k=1}^K \frac{N_k}{n} \Psi(\hat{Q}_{n, Y}^{*}, \hat{Q}_W^{k}) \\ +&= \frac{1}{n} \sum_{k=1}^K \sum_{\{i: k(i) = k\}} \hat{Q}_{n, Y}^{*}(W_i, T_i) \end{aligned} ``` -Notice that while ``\hat{\Psi}_{n, CV-TMLE}`` is not a plugin estimator anymore, it still respects the natural range of the parameter because it is an average of plugin estimators. Also, because ``\hat{Q}_Y^{*,-k}`` is based both on training and validation samples, the elements of the sum are not truly independent. +Notice that, while ``\hat{\Psi}_{n, CV-TMLE}`` is not a plugin estimator anymore, it still respects the natural range of the parameter because it is an average of plugin estimators. Also, because ``\hat{Q}_{n,Y}^*`` is based on the entire data, it seems this estimator is still somehow double-dipping. -## Acknowledgements +## References -The content of this page was largely inspired from: +The content of this page is largely inspired from: - [Semiparametric doubly robust targeted double machine learning: a review](https://arxiv.org/pdf/2203.06469.pdf). - [Introduction to Modern Causal Inference](https://alejandroschuler.github.io/mci/). - [Targeted Learning, Causal Inference for Observational and Experimental Data](https://link.springer.com/book/10.1007/978-1-4419-9782-1). +- [STATS 361: Causal Inference](https://web.stanford.edu/~swager/stats361.pdf) diff --git a/docs/src/index.md b/docs/src/index.md index f5f60725..f3500c58 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,7 +6,7 @@ CurrentModule = TMLE ## Overview -TMLE.jl is a Julia implementation of the Targeted Minimum Loss-Based Estimation ([TMLE](https://link.springer.com/book/10.1007/978-1-4419-9782-1)) framework. If you are interested in efficient and unbiased estimation of causal effects, you are in the right place. Since TMLE uses machine-learning methods to estimate nuisance estimands, the present package is based upon [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/). +TMLE.jl is a Julia implementation of the Targeted Minimum Loss-Based Estimation ([TMLE](https://link.springer.com/book/10.1007/978-1-4419-9782-1)) framework. If you are interested in leveraging the power of modern machine-learning methods while preserving interpretability and statistical inference guarantees, you are in the right place. TMLE.jl is compatible with any [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) compliant algorithm and any dataset respecting the [Tables](https://tables.juliadata.org/stable/) interface. ## Installation @@ -89,9 +89,10 @@ Distinguishing Features: - Estimands: Counterfactual Mean, Average Treatment Effect, Interactions, Any composition thereof - Estimators: TMLE, One-Step - Machine-Learning: Any [MLJ](https://alan-turing-institute.github.io/MLJ.jl/stable/) compatible model -- Treatments Variables: - - Multiple treatment variables (with their own set of confounders) - - Categorical treatment variables (factorial analysis) +- Dataset: Any dataset respecting the [Tables](https://tables.juliadata.org/stable/) interface (e.g. 
[DataFrames.jl](https://dataframes.juliadata.org/stable/)) +- Factorial Treatment Variables: + - Multiple treatments + - Categorical treatment values ## Citing TMLE.jl diff --git a/docs/src/resources.md b/docs/src/resources.md index ea3542cd..f39d4901 100644 --- a/docs/src/resources.md +++ b/docs/src/resources.md @@ -9,9 +9,15 @@ These are two very clear introductions to causal inference and semi-parametric e - [Introduction to Modern Causal Inference](https://alejandroschuler.github.io/mci/) (Alejandro Schuler, Mark J. van der Laan). - [A Ride in Targeted Learning Territory](https://achambaz.github.io/tlride/) (David Benkeser, Antoine Chambaz). -## Text Books +## Youtube + +- [Targeted Learning Webinar Series](https://youtube.com/playlist?list=PLy_CaFomwGGGH10tbq9zSyfHVrdklMaLe&si=BfJZ2fvDtGUZwELy) +- [TL Briefs](https://youtube.com/playlist?list=PLy_CaFomwGGFMxFtf4gkmC70dP9J6Q3Wa&si=aBZUnjJtOidIjhwR) + +## Books and Lecture Notes - [Targeted Learning](https://link.springer.com/book/10.1007/978-1-4419-9782-1) (Mark J. van der Laan, Sherri Rose). +- [STATS 361: Causal Inference](https://web.stanford.edu/~swager/stats361.pdf) ## Journal articles diff --git a/docs/src/user_guide/estimation.md b/docs/src/user_guide/estimation.md index a855eab7..f676f48c 100644 --- a/docs/src/user_guide/estimation.md +++ b/docs/src/user_guide/estimation.md @@ -4,7 +4,7 @@ CurrentModule = TMLE # Estimation -## Estimating a single Estimand +## Constructing and Using Estimators ```@setup estimation using Random @@ -51,11 +51,12 @@ scm = SCM([ ) ``` -Once a statistical estimand has been defined, we can proceed with estimation. At the moment, we provide 3 main types of estimators: +Once a statistical estimand has been defined, we can proceed with estimation. There are two semi-parametric efficient estimators in TMLE.jl: -- Targeted Maximum Likelihood Estimator (`TMLEE`) -- One-Step Estimator (`OSE`) -- Naive Plugin Estimator (`NAIVE`) +- The Targeted Maximum-Likelihood Estimator (`TMLEE`) +- The One-Step Estimator (`OSE`) + +While they have similar asymptotic properties, their finite sample performance may be different. They also have a very distinguishing feature, the TMLE is a plugin estimator, which means it respects the natural bounds of the estimand of interest. In contrast, the OSE may in theory report values outside these bounds. In practice, this is not often the case and the estimand of interest may not impose any restriction on its domain. Drawing from the example dataset and `SCM` from the Walk Through section, we can estimate the ATE for `T₁`. Let's use TMLE: @@ -72,27 +73,25 @@ result₁ nothing # hide ``` -We see that both models corresponding to variables `Y` and `T₁` were fitted in the process but that the model for `T₂` was not because it was not necessary to estimate this estimand. - -The `cache` contains estimates for the nuisance functions that were necessary to estimate the ATE. For instance, we can see what is the value of ``\epsilon`` corresponding to the clever covariate. +The `cache` (see below) contains estimates for the nuisance functions that were necessary to estimate the ATE. For instance, we can see what is the value of ``\epsilon`` corresponding to the clever covariate. ```@example estimation ϵ = last_fluctuation_epsilon(cache) ``` -The `result₁` structure corresponds to the estimation result and should report 3 main elements: +The `result₁` structure corresponds to the estimation result and will display the result of a T-Test including: - A point estimate. - A 95% confidence interval. 
- A p-value (Corresponding to the test that the estimand is different than 0). -This is only summary statistics but since both the TMLE and OSE are asymptotically linear estimators, standard Z/T tests from [HypothesisTests.jl](https://juliastats.org/HypothesisTests.jl/stable/) can be performed. +Both the TMLE and OSE are asymptotically linear estimators, standard Z/T tests from [HypothesisTests.jl](https://juliastats.org/HypothesisTests.jl/stable/) can be performed and `confint` and `pvalue` methods used. ```@example estimation -tmle_test_result₁ = OneSampleTTest(result₁) +tmle_test_result₁ = pvalue(OneSampleTTest(result₁)) ``` -We could now get an interest in the Average Treatment Effect of `T₂` that we will estimate with an `OSE`: +Let us now turn to the Average Treatment Effect of `T₂`, we will estimate it with a `OSE`: ```@example estimation Ψ₂ = ATE( @@ -109,67 +108,73 @@ nothing # hide Again, required nuisance functions are fitted and stored in the cache. -## CV-Estimation +## Specifying Models -When performing "vanilla" semi-parametric estimation, we are essentially using the dataset twice, once for the estimation of the nuisance functions and once for the estimation of the parameter of interest. This means that there is a risk of over-fitting and residual bias ([see here](https://arxiv.org/abs/2203.06469) for some discussion). One way to address this limitation is to use a technique called sample-splitting / cross-validating. +By default, TMLE.jl uses generalized linear models for the estimation of relevant and nuisance factors such as the outcome mean and the propensity score. However, this is not the recommended usage since the estimators' performance is closely related to how well we can estimate these factors. More sophisticated models can be provided using the `models` keyword argument of each estimator which is essentially a `NamedTuple` mapping variables' names to their respective model. -### Usage +Rather than specifying a specific model for each variable it may be easier to override the default models using the `default_models` function: -To create a cross-validated estimator simply specify the `resampling` keyword argument: +For example one can override all default models with XGBoost models from `MLJXGBoostInterface`: ```@example estimation -TMLEE(resampling=StratifiedCV()); -``` - -or - -```julia -OSE(resampling=StratifiedCV(nfolds=3)); +using MLJXGBoostInterface +xgboost_regressor = XGBoostRegressor() +xgboost_classifier = XGBoostClassifier() +models = default_models( + Q_binary=xgboost_classifier, + Q_continuous=xgboost_regressor, + G=xgboost_classifier +) +tmle_gboost = TMLEE(models=models) ``` -We further explain below the procedure for both Targeted Maximum-Likelihood estimation and One-Step estimation and provide some considerations when using sample-splitting. +The advantage of using `default_models` is that it will automatically prepend each model with a [ContinuousEncoder](https://alan-turing-institute.github.io/MLJ.jl/dev/transformers/#MLJModels.ContinuousEncoder) to make sure the correct types are passed to the downstream models. -### Preliminaries +Super Learning ([Stack](https://alan-turing-institute.github.io/MLJ.jl/dev/model_stacking/#Model-Stacking)) as well as variable specific models can be defined as well. Here is a more customized version: -Even though the idea behind cross-validated estimators is simple, the procedure may seem obscure at first. 
The purpose of this subsection is to explain how it works, and it will be useful to introduce some notation. Let `resampling` be a `MLJ.ResamplingStrategy` partitioning the dataset into K folds. According to this `resampling` method, each sample ``i`` in the dataset belongs to a specific (validation) fold ``k``: +```@example estimation +lr = LogisticClassifier(lambda=0.) +stack_binary = Stack( + metalearner=lr, + xgboost=xgboost_classifier, + lr=lr +) -- ``k(i)`` denotes the (validation) fold sample ``i`` belongs to ``k(i) \in [1, K]``. -- ``-k(i)`` denotes the remaining (training) folds. +models = ( + T₁ = with_encoder(xgboost_classifier), # T₁ with XGBoost prepended with a Continuous Encoder + default_models( # For all other variables use the following defaults + Q_binary=stack_binary, # A Super Learner + Q_continuous=xgboost_regressor, # An XGBoost + # Unspecified G defaults to Logistic Regression + )... +) -```@raw html -
- -
+tmle_custom = TMLEE(models=models) ``` -The estimators we are considering are asymptotically linear. This means they can be written as an average over their influence function. If we denote by ``\phi`` this influence function, which itself depends on nuisance functions (e.g. outcome mean, propensity score...), then ``i \mapsto \hat{\phi}^{-k(i)}(i)`` is a cross-validated estimator of this influence function. The notation means that the nuisance functions learnt on all folds but the one containing sample ``i`` are used to make the prediction for sample ``i``. Here, we loosely refer to ``i`` as a sample (e.g. ``(Y, T, W)_i``), regardless of the actual data structure. - -We are now ready to define the cross-validated One-Step and Targeted Maximum-Likelihood estimators. +Notice that `with_encoder` is simply a shorthand to construct a pipeline with a `ContinuousEncoder` and that the resulting `models` is simply a `NamedTuple`. -### CV-OSE +## CV-Estimation -The cross-validated One-Step estimator is the average of the cross-validated influence function, it can be compactly written as: +Canonical TMLE/OSE are essentially using the dataset twice, once for the estimation of the nuisance functions and once for the estimation of the parameter of interest. This means that there is a risk of over-fitting and residual bias ([see here](https://arxiv.org/abs/2203.06469) for some discussion). One way to address this limitation is to use a technique called sample-splitting / cross-validating. In order to activate the sample-splitting mode, simply provide a `MLJ.ResamplingStrategy` using the `resampling` keyword argument: -```math -\hat{\Psi}_{CV} = \frac{1}{n} \sum_{i=1}^n \hat{\phi}^{-k(i)}(i) +```@example estimation +TMLEE(resampling=StratifiedCV()); ``` -And the associated variance estimator: +or -```math -\hat{V}_{\Psi, CV} = \frac{1}{n-1} \sum_{i=1}^n (\hat{\phi}^{-k(i)}(i) - \hat{\Psi}_{CV})^2 +```julia +OSE(resampling=StratifiedCV(nfolds=3)); ``` -### CV-TMLE - - -### Further Considerations +There are some practical considerations - Choice of `resampling` Strategy: The theory behind sample-splitting requires the nuisance functions to be sufficiently well estimated on **each and every** fold. A practical aspect of it is that each fold should contain a sample representative of the dataset. In particular, when the treatment and outcome variables are categorical it is important to make sure the proportions are preserved. This is typically done using `StratifiedCV`. - Computational Complexity: Sample-splitting results in ``K`` fits of the nuisance functions, drastically increasing computational complexity. In particular, if the nuisance functions are estimated using (P-fold) Super-Learning, this will result in two nested cross-validation loops and ``K \times P`` fits. -- Caching of Nuisance Functions: Because the `resampling` strategy typically needs to preserve the outcome and treatment proportions, very little reuse of cached models is possible (see below). +- Caching of Nuisance Functions: Because the `resampling` strategy typically needs to preserve the outcome and treatment proportions, very little reuse of cached models is possible (see [Caching Models](@ref)). -## Caching model fits +## Caching Models Let's now see how the `cache` can be reused with a new estimand, say the Total Average Treatment Effect of both `T₁` and `T₂`. 
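The hunk ends just before the corresponding example. As a purely hypothetical sketch of what such a cache re-use could look like (the constructor arguments mirror the `ATE` calls shown earlier and the confounder names are assumed, not taken from this diff):

```julia
# Hypothetical sketch: re-using the cache for a joint ATE of T₁ and T₂.
# Confounder names are assumed to follow the Walk Through dataset.
Ψ₃ = ATE(
    outcome = :Y,
    treatment_values = (
        T₁ = (case = true, control = false),
        T₂ = (case = true, control = false)
    ),
    treatment_confounders = (
        T₁ = [:W₁₁, :W₁₂],
        T₂ = [:W₂₁, :W₂₂]
    )
)
tmle = TMLEE()
result₃, cache = tmle(Ψ₃, dataset; cache=cache, verbosity=0)
```

Since the nuisance estimates fitted for the previous estimands are already stored in `cache`, only the factors that are still missing would be expected to be re-fitted.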
From b68434edf73536e23510724c66600b18b58238e7 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Mon, 22 Apr 2024 15:11:16 +0100 Subject: [PATCH 7/9] update estimate computation --- src/counterfactual_mean_based/estimators.jl | 26 ++++++++++++++------- src/counterfactual_mean_based/gradient.jl | 12 +++------- test/counterfactual_mean_based/gradient.jl | 6 ++--- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/counterfactual_mean_based/estimators.jl b/src/counterfactual_mean_based/estimators.jl index e06fb92e..2df3944b 100644 --- a/src/counterfactual_mean_based/estimators.jl +++ b/src/counterfactual_mean_based/estimators.jl @@ -221,7 +221,7 @@ function (tmle::TMLEE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict() machine_cache=tmle.machine_cache ) # Estimation results after TMLE - IC, Ψ̂ = gradient_and_estimate(Ψ, targeted_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) + IC, Ψ̂ = gradient_and_estimate(tmle, Ψ, targeted_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) σ̂ = std(IC) n = size(IC, 1) verbosity >= 1 && @info "Done." @@ -229,6 +229,9 @@ function (tmle::TMLEE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict() return TMLEstimate(Ψ, Ψ̂, σ̂, n, IC), cache end +gradient_and_estimate(::TMLEE, Ψ, factors, dataset; ps_lowerbound=1e-8) = + gradient_and_plugin_estimate(Ψ, factors, dataset; ps_lowerbound=ps_lowerbound) + ##################################################################### ### OSE ### ##################################################################### @@ -267,14 +270,14 @@ ose = OSE() OSE(;models=default_models(), resampling=nothing, ps_lowerbound=1e-8, machine_cache=false) = OSE(models, resampling, ps_lowerbound, machine_cache) -function (estimator::OSE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict(), verbosity=1) +function (ose::OSE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dict(), verbosity=1) # Check the estimand against the dataset check_treatment_levels(Ψ, dataset) # Initial fit of the SCM's relevant factors initial_factors = get_relevant_factors(Ψ) nomissing_dataset = nomissing(dataset, variables(initial_factors)) - initial_factors_dataset = choose_initial_dataset(dataset, nomissing_dataset, estimator.resampling) - initial_factors_estimator = CMRelevantFactorsEstimator(estimator.resampling, estimator.models) + initial_factors_dataset = choose_initial_dataset(dataset, nomissing_dataset, ose.resampling) + initial_factors_estimator = CMRelevantFactorsEstimator(ose.resampling, ose.models) initial_factors_estimate = initial_factors_estimator( initial_factors, initial_factors_dataset; @@ -283,16 +286,21 @@ function (estimator::OSE)(Ψ::StatisticalCMCompositeEstimand, dataset; cache=Dic ) # Get propensity score truncation threshold n = nrows(nomissing_dataset) - ps_lowerbound = ps_lower_bound(n, estimator.ps_lowerbound) + ps_lowerbound = ps_lower_bound(n, ose.ps_lowerbound) # Gradient and estimate - IC, Ψ̂ = gradient_and_estimate(Ψ, initial_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) - IC_mean = mean(IC) - IC .-= IC_mean + IC, Ψ̂ = gradient_and_estimate(ose, Ψ, initial_factors_estimate, nomissing_dataset; ps_lowerbound=ps_lowerbound) σ̂ = std(IC) n = size(IC, 1) verbosity >= 1 && @info "Done." 
- return OSEstimate(Ψ, Ψ̂ + IC_mean, σ̂, n, IC), cache + return OSEstimate(Ψ, Ψ̂, σ̂, n, IC), cache +end + +function gradient_and_estimate(::OSE, Ψ, factors, dataset; ps_lowerbound=1e-8) + IC, Ψ̂ = gradient_and_plugin_estimate(Ψ, factors, dataset; ps_lowerbound=ps_lowerbound) + IC_mean = mean(IC) + IC .-= IC_mean + return IC, Ψ̂ + IC_mean end ##################################################################### diff --git a/src/counterfactual_mean_based/gradient.jl b/src/counterfactual_mean_based/gradient.jl index 4fd6c066..0cbf7424 100644 --- a/src/counterfactual_mean_based/gradient.jl +++ b/src/counterfactual_mean_based/gradient.jl @@ -24,13 +24,7 @@ function counterfactual_aggregate(Ψ::StatisticalCMCompositeEstimand, Q, dataset return ctf_agg end -compute_estimate(ctf_aggregate, ::Nothing) = mean(ctf_aggregate) - -""" -Accounting for possibly slitghly different folds' sizes. -""" -compute_estimate(ctf_aggregate, train_validation_indices) = - mean(compute_estimate(ctf_aggregate[val_indices], nothing) for (_, val_indices) in train_validation_indices) +plugin_estimate(ctf_aggregate) = mean(ctf_aggregate) """ @@ -56,11 +50,11 @@ function ∇YX(Ψ::StatisticalCMCompositeEstimand, Q, G, dataset; ps_lowerbound= end -function gradient_and_estimate(Ψ::StatisticalCMCompositeEstimand, factors, dataset; ps_lowerbound=1e-8) +function gradient_and_plugin_estimate(Ψ::StatisticalCMCompositeEstimand, factors, dataset; ps_lowerbound=1e-8) Q = factors.outcome_mean G = factors.propensity_score ctf_agg = counterfactual_aggregate(Ψ, Q, dataset) - Ψ̂ = compute_estimate(ctf_agg, train_validation_indices_from_factors(factors)) + Ψ̂ = plugin_estimate(ctf_agg) IC = ∇YX(Ψ, Q, G, dataset; ps_lowerbound = ps_lowerbound) .+ ∇W(ctf_agg, Ψ̂) return IC, Ψ̂ end diff --git a/test/counterfactual_mean_based/gradient.jl b/test/counterfactual_mean_based/gradient.jl index f610ad7a..8a2e7f7a 100644 --- a/test/counterfactual_mean_based/gradient.jl +++ b/test/counterfactual_mean_based/gradient.jl @@ -24,7 +24,7 @@ function one_treatment_dataset(;n=100) ) end -@testset "Test gradient_and_estimate" begin +@testset "Test gradient_and_plugin_estimate" begin ps_lowerbound = 1e-8 Ψ = ATE( outcome = :Y, @@ -60,8 +60,8 @@ end expectedΨ̂ = mean(ctf_agg) ∇W = TMLE.∇W(ctf_agg, expectedΨ̂) @test ∇W == ctf_agg .- expectedΨ̂ - # gradient_and_estimate - IC, Ψ̂ = TMLE.gradient_and_estimate(Ψ, η̂ₙ, dataset; ps_lowerbound=ps_lowerbound) + # gradient_and_plugin_estimate + IC, Ψ̂ = TMLE.gradient_and_plugin_estimate(Ψ, η̂ₙ, dataset; ps_lowerbound=ps_lowerbound) @test expectedΨ̂ == Ψ̂ @test IC == ∇YX .+ ∇W end From eaa9ffa6c3ad0376fe4dae9b996876a38610612d Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Mon, 29 Apr 2024 17:19:49 +0200 Subject: [PATCH 8/9] take review comments into account --- docs/src/estimators_cheatsheet.md | 60 ++++++++++--------- docs/src/index.md | 4 +- test/counterfactual_mean_based/fluctuation.jl | 4 ++ 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/docs/src/estimators_cheatsheet.md b/docs/src/estimators_cheatsheet.md index b439cdb8..05c1d27c 100644 --- a/docs/src/estimators_cheatsheet.md +++ b/docs/src/estimators_cheatsheet.md @@ -12,11 +12,11 @@ Finally, if you find inconsistencies or imprecision, please report it, so we can This is the notation we use throughout: -- The observed data: We assume we observe the realization of a random vector ``\bold{Z}_n = (Z_1, ..., Z_n)``. The components of ``\bold{Z}`` are assumed independent and identically distributed according to ``\mathbb{P}``, i.e. 
``\forall i \in \{1, ..., n\},Z_i \sim \mathbb{P}``. Note that each ``Z_i`` is usually a vector as well, for us: ``(W_i, T_i, Y_i)``. +- The observed data: We assume we observe the realization of a random vector ``\bold{Z}_n = (Z_1, ..., Z_n)``. The components of ``\bold{Z}`` are assumed independent and identically distributed according to ``\mathbb{P}``, i.e. ``\forall i \in \{1, ..., n\},Z_i \sim \mathbb{P}``. Note that each ``Z_i`` is usually a vector as well, for us: ``Z_i = (W_i, T_i, Y_i)``. - The Empirical Distribution : ``\mathbb{P}_n(A) = \frac{1}{n}\sum_{i=1}^n 1_A(Z_i)``. For each set ``A``, it computes the mean number of points falling into ``A``. -- Expected value of a function ``f`` with respect to a distribution ``\mathbb{P}``: ``\mathbb{E}_{\mathbb{P}} f = \mathbb{P}f``. Note that for the empirical distribution ``\mathbb{P}_n``, this is simply: ``\mathbb{P}_nf = \frac{1}{n}\sum_{i=1}^nf(Z_i)``, in other words, the sample mean. +- Expected value of a function ``f`` of the data ``Z`` with respect to a distribution ``\mathbb{P}``: ``\mathbb{P}f \equiv \mathbb{E}_{\mathbb{P}}[f(Z)]``. Note that for the empirical distribution ``\mathbb{P}_n``, this is simply: ``\mathbb{P}_nf = \frac{1}{n}\sum_{i=1}^nf(Z_i)``, in other words, the sample mean. - The Estimand: The unknown finite dimensional quantity we are interested in. ``\Psi`` is defined as a functional, i.e. ``\mathbb{P} \mapsto \Psi(\mathbb{P}) \in \mathbb{R}^d``. In this document ``d=1``. -- The Gradient of ``\Psi`` at the distribution ``\mathbb{P}`` : ``\phi_{\mathbb{P}}``, a function of the data, i.e. ``Z \mapsto \phi_{\mathbb{P}}(Z)``. The gradient satisfies: ``\mathbb{P}\phi_{\mathbb{P}} = 0`` and ``Var[\phi_{\mathbb{P}}] < \infty``. +- The Gradient of ``\Psi`` at the distribution ``\mathbb{P}`` : ``\phi_{\mathbb{P}}``, a function of the data, i.e. ``Z \mapsto \phi_{\mathbb{P}}(Z)``. The gradient satisfies two properties: (i) ``\mathbb{P}\phi_{\mathbb{P}} = 0``, and (ii) ``Var[\phi_{\mathbb{P}}] < \infty``. - An estimator, is a function of the data ``\bold{Z}_n``, and thus a random variable, that seeks to approximate an unknown quantity. For instance, ``\hat{\Psi}_n`` denotes an estimator for ``\Psi``. Notice that the empirical distribution is an estimator for ``\mathbb{P}``, but the hat is omitted to distinguish it from other estimators that will be defined later on. ### The Counterfactual Mean @@ -34,7 +34,7 @@ Y &= f_Y(T, W, U_Y) This is certainly the most common scenario in causal inference where ``Y`` is an outcome of interest, ``T`` a set of treatment variables and ``W`` a set of confounding variables. We are generally interested in the effect of ``T`` on ``Y``. In this document we will consider the counterfactual mean as a workhorse. We will show that the estimators for the Average Treatment Effect and Average Interaction Effect can easily be derived from it. 
Under usual conditions, the counterfactual mean identifies to the following statistical estimand: ```math -CM_t(\mathbb{P}) = \Psi_t(\mathbb{P}) = \mathbb{E}_{\mathbb{P}}[\mathbb{E}_{\mathbb{P}}[Y | W, T = t]] = \int \mathbb{E}_{\mathbb{P}}[Y | W, T = t] d\mathbb{P}(w) +CM_t(\mathbb{P}) = \Psi_t(\mathbb{P}) = \mathbb{E}_{\mathbb{P}}[\mathbb{E}_{\mathbb{P}}[Y | W, T = t]] = \int \mathbb{E}_{\mathbb{P}}[Y | W=w, T = t] d\mathbb{P}(w) ``` From this definition, we can see that ``\Psi_t`` depends on ``\mathbb{P}`` only through two relevant factors: @@ -52,7 +52,7 @@ So that ``\Psi(\mathbb{P}) = \Psi(Q_Y, Q_W)`` (the ``t`` subscript is dropped as g(W, T) = \mathbb{P}(T | W) ``` -Since the gradient of ``\Psi``: +This is because the gradient of ``\Psi``: ```math \phi_{CM_{t}, \mathbb{P}}(W, T, Y) = \frac{\mathbb{1}(T = t)}{g(W, t)}(Y − Q_Y(W, t)) + Q_Y(W, t) − \Psi(\mathbb{P}) @@ -60,23 +60,23 @@ Since the gradient of ``\Psi``: which is the foundation of semi-parametric estimation, depends on this so-called nuisance parameter. -### Average Treatment Effect and Average Interaction Effect? +### Average Treatment Effect and Average Interaction Effect -They are simple linear combinations of counterfactual means and so is their gradients. +They are simple linear combinations of counterfactual means and so are their gradients. #### Average Treatment Effect In all generality, for two values of a categorical treatment variable ``T: t_{control} \rightarrow t_{case}``, the Average Treatment Effect (ATE) is defined by: ```math -ATE_{t_{case}, t_{control}}(\mathbb{P}) = CM_{t_{case}}(\mathbb{P}) - CM_{t_{control}}(\mathbb{P}) +ATE_{t_{case}, t_{control}}(\mathbb{P}) = (CM_{t_{case}} - CM_{t_{control}})(\mathbb{P}) ``` And it's associated gradient is: ```math \begin{aligned} -\phi_{ATE}(W, T, Y) &= \phi_{CM_{t_{case}}}(W, T, Y) - \phi_{CM_{t_{control}}}(W, T, Y) \\ +\phi_{ATE}(W, T, Y) &= (\phi_{CM_{t_{case}}} - \phi_{CM_{t_{control}}})(W, T, Y) \\ &= H(W, T)(Y − Q_Y(W, T)) + C(W) − ATE_{t_{case}, t_{control}}(\mathbb{P}) \end{aligned} ``` @@ -86,7 +86,7 @@ with: ```math \begin{aligned} \begin{cases} - H(W, T) &= \frac{(-\mathbb{1}(T \in (t_{case}, t_{control})))^{T=t_{control}}}{g(W, T)} \\ + H(W, T) &= \frac{ \mathbb{1}(T = t_{case}) - \mathbb{1}(T = t_{control})}{g(W, T)} \\ C(W) &= (Q_Y(W, t_{case}) - Q_Y(W, t_{control})) \end{cases} \end{aligned} @@ -96,17 +96,21 @@ with: #### Average Interaction Effect -For simplicity, we only consider two treatments ``T = (T_1, T_2)`` such that: ``T_1: t_{1,control} \rightarrow t_{1,case}`` and ``T_2: t_{2,control} \rightarrow t_{2,case}``. The Average Interaction Effect (AIE) of ``(T_1, T_2)`` is defined as: +For simplicity, we only consider two treatments ``T = (T_1, T_2)`` such that ``T_1: t_{1,control} \rightarrow t_{1,case}`` and ``T_2: t_{2,control} \rightarrow t_{2,case}``. 
The Average Interaction Effect (AIE) of ``(T_1, T_2)`` is defined as: ```math -AIE(\mathbb{P}) = CM_{t_{1, case}, t_{2, case}}(\mathbb{P}) - CM_{t_{1, case}, t_{2, control}}(\mathbb{P}) - CM_{t_{1, control}, t_{2, case}}(\mathbb{P}) + CM_{t_{1, control}, t_{2, control}}(\mathbb{P}) +\begin{aligned} +AIE(\mathbb{P}) &= (CM_{t_{1, case}, t_{2, case}} - CM_{t_{1, case}, t_{2, control}} \\ +&- CM_{t_{1, control}, t_{2, case}} + CM_{t_{1, control}, t_{2, control}})(\mathbb{P}) +\end{aligned} ``` And its gradient is given by: ```math \begin{aligned} -\phi_{AIE, }(W, T, Y) &= \phi_{CM_{t_{1,case}, t_{2,case}}}(W, T, Y) - \phi_{CM_{t_{1,case}, t_{2,control}}}(W, T, Y) - \phi_{CM_{t_{1,control}, t_{2,case}}}(W, T, Y) + \phi_{CM_{t_{1,control}, t_{2,control}}}(W, T, Y) \\ +\phi_{AIE, }(W, T, Y) &= (\phi_{CM_{t_{1,case}, t_{2,case}}} - \phi_{CM_{t_{1,case}, t_{2,control}}} \\ +&- \phi_{CM_{t_{1,control}, t_{2,case}}} + \phi_{CM_{t_{1,control}, t_{2,control}}})(W, T, Y) \\ &= H(W, T)(Y − Q_Y(W, T)) + C(W) − ATE_{t_{case}, t_{control}}(\mathbb{P}) \end{aligned} ``` @@ -116,7 +120,7 @@ with: ```math \begin{aligned} \begin{cases} - H(W, T) &= \frac{(-\mathbb{1}(T_1 \in (t_{1, case}, t_{1, control})))^{T_1=t_{1, control}}(-\mathbb{1}(T_2 \in (t_{2, case}, t_{2, control})))^{T_2=t_{2, control}}}{g(W, T)} \\ + H(W, T) &= \frac{(\mathbb{1}(T_1 = t_{1, case}) - \mathbb{1}(T_1 = t_{1, control}))(\mathbb{1}(T_2 = t_{2, case}) - \mathbb{1}(T_2 = t_{2, control}))}{g(W, T)} \\ C(W) &= (Q_Y(W, t_{1, case}, t_{2, case}) - Q_Y(W, t_{1, case}, t_{2, control}) - Q_Y(W, t_{1, control}, t_{2, case}) + Q_Y(W, t_{1, control}, t_{2, control})) \end{cases} \end{aligned} @@ -147,9 +151,9 @@ This suggests that a plugin estimator, one that simply evaluates ``\Psi`` at an \mathbb{P}_n\phi_{\mathbb{P}}(Z) ``` -By the central limit theorem, it is asymptotically normal with variance ``Var[\phi]/n``, it is used to build confidence interval. +By the central limit theorem, it is asymptotically normal with variance ``Var[\phi]/n``, it will be used to construct a confidence interval for our final estimate. -### 2. The Bias Term +### 2. The First-Order Bias Term ```math - \mathbb{P}_n\phi_{\mathbb{\hat{P}}}(Z) @@ -166,7 +170,7 @@ This is the term both the One-Step estimator and the Targeted Maximum-Likelihood This can be shown to be of order ``o_{\mathbb{P}}(\frac{1}{\sqrt{n}})`` if ``\phi_{\hat{\mathbb{P}}}`` converges to ``\phi_{\mathbb{P}}`` in ``L_2(\mathbb{P})`` norm, that is: ```math -\int (\phi_{\hat{\mathbb{P}}}(Z) - \phi_{\mathbb{P}}(Z) )^2 d\mathbb{P}(Z) = o_{\mathbb{P}}(\frac{1}{\sqrt{n}}) +\int (\phi_{\hat{\mathbb{P}}}(Z) - \phi_{\mathbb{P}}(Z) )^2 d\mathbb{P}(Z) = o_{\mathbb{P}}\left(\frac{1}{\sqrt{n}}\right) ``` and, any of the following holds: @@ -174,13 +178,13 @@ and, any of the following holds: - ``\phi`` (or equivalently its components) is [[Donsker](https://en.wikipedia.org/wiki/Donsker_classes)], i.e., not too complex. - The estimator is constructed using sample-splitting (see cross-validated estimators). -### 4. The Second-Order Remainder Term +### 4. The Exact Second-Order Remainder Term ```math R_2(\mathbb{\hat{P}}, \mathbb{P}) ``` -This term is usually more complex to analyse. However, for the counterfactual mean, it can be shown that if ``g(W,T) \geq \frac{1}{\eta}`` (positivity constraint): +This term is usually more complex to analyse. 
Note however, that it is entirely defined by the von Mises expansion, and for the counterfactual mean, it can be shown that if ``g(W,T) \geq \frac{1}{\eta}`` (positivity constraint): ```math |R_2(\mathbb{\hat{P}}, \mathbb{P})| \leq \eta ||\hat{Q}_{n, Y} - Q_{Y}|| \cdot ||\hat{g}_{n} - g|| @@ -193,7 +197,7 @@ and thus, if the estimators ``\hat{Q}_{n, Y}`` and ``\hat{g}_{n}`` converge at a According to the previous section, the OSE and TMLE will be asymptotically linear with efficient influence curve the gradient ``\phi``: ```math -\sqrt{n}(\hat{\Psi} - \Psi) = \frac{1}{\sqrt{n}} \sum_{i=1}^n \phi(Z_i) + o_{\mathbb{P}}(\frac{1}{\sqrt{n}}) +\sqrt{n}(\hat{\Psi} - \Psi) = \frac{1}{\sqrt{n}} \sum_{i=1}^n \phi(Z_i) + o_{\mathbb{P}}\left(\frac{1}{\sqrt{n}}\right) ``` By the [Central Limit Theorem](https://en.wikipedia.org/wiki/Central_limit_theorem) and [Slutsky's Theorem](https://en.wikipedia.org/wiki/Slutsky%27s_theorem) we have: @@ -214,6 +218,8 @@ which can be used to build confidence intervals: \underset{n \to \infty}{\lim} P(\hat{\Psi}_n - \frac{S_n}{\sqrt{n}}z_{\alpha} \leq \Psi \leq \hat{\Psi}_n + \frac{S_n}{\sqrt{n}}z_{\alpha}) = 1 - 2\alpha ``` +Here, ``z_{\alpha}`` denotes the ``\alpha``-quantile function of the standard normal distribution + ## One-Step Estimator ### Canonical OSE @@ -221,12 +227,12 @@ which can be used to build confidence intervals: The One-Step estimator is very intuitive, it simply corrects the initial plugin estimator by adding in the residual bias term. As such, it corrects for the bias in the estimand's space. Let ``\hat{P}= (\hat{Q}_{n,Y}, \hat{Q}_{n,W})`` be an estimator of the relevant factors of ``\mathbb{P}`` as well as ``\hat{g}_n`` an estimator of the nuisance function ``g``. The OSE is: ```math -\hat{\Psi}_{n, OSE} = \Psi(\hat{P}) + P_n{\phi_{\hat{P}}} +\hat{\Psi}_{n, OSE} = \Psi(\hat{P}) + \mathbb{P}_n{\phi_{\hat{P}}} ``` ### CV-OSE -Assume the realization of ``n`` random variables ``K_i`` assigning each sample to one of ``{1, ..., K}`` folds. For a given sample ``i``, we denote by ``k(i)`` the validation fold it belongs to and ``-k(i)`` the remaining training fold. Similarly, we denote by ``\hat{Q}^{k}`` an estimator for ``Q`` obtained from samples in the validation fold ``k`` and ``\hat{Q}^{-k}`` an estimator for ``Q`` obtained from samples in the (training) fold ``\{1, ..., K\}-\{k\}``. +Instead of assuming Donsker conditions limiting the complexity of the algorithms used, we can use sample splitting techniques. For this, we split the data into K folds of (roughly) equal size. For a given sample i, we denote by ``k(i)`` the fold it belongs to (called validation fold/set) and by ``-k(i)`` the union of all remaining folds (called training set). Similarly, we denote by ``\hat{Q}^{k}`` an estimator for ``Q`` obtained from samples in the validation fold ``k`` and ``\hat{Q}^{-k}`` an estimator for ``Q`` obtained from samples in the (training) fold ``\{1, ..., K\}-\{k\}``. The cross-validated One-Step estimator can be compactly written as an average over the folds of sub one-step estimators: @@ -237,7 +243,7 @@ The cross-validated One-Step estimator can be compactly written as an average ov \end{aligned} ``` -The important thing to note is that for each sub one-step estimator, the sum runs over the validation samples while ``\hat{Q}_Y^{-k}`` and ``\hat{\phi}^{-k}`` are estimated using the training samples. +Where the first equation is the general form of the estimator while the second one corresponds to the counterfactual mean. 
The important thing to note is that for each sub one-step estimator, the sum runs over the validation samples while ``\hat{Q}_Y^{-k}`` and ``\hat{\phi}^{-k}`` are estimated using the training samples. ## Targeted Maximum-Likelihood Estimator @@ -245,15 +251,15 @@ Unlike the One-Step estimator, the Targeted Maximum-Likelihood Estimator correct ### Canonical TMLE -The way ``\hat{\mathbb{P}}`` is modified is by means of a parametric sub-model also known as a fluctuation. It can be shown that for the conditional mean, it is sufficient to fluctuate ``\hat{Q}_{n, Y}`` only once using the following fluctuations: +The way ``\hat{\mathbb{P}}`` is modified is by means of a parametric sub-model also known as a fluctuation. The choice of fluctuation depends on the target parameter of interest. It can be shown that for the conditional mean, it is sufficient to fluctuate ``\hat{Q}_{n, Y}`` only once using the following fluctuations: - ``\hat{Q}_{Y, \epsilon}(W, T) = \hat{Q}_{n, Y}(T, W) + \epsilon \hat{H}(T, W)``, for continuous outcomes ``Y``. - ``\hat{Q}_{Y, \epsilon}(W, T) = \frac{1}{1 + e^{-(logit(\hat{Q}_{n, Y}(T, W)) + \epsilon \hat{H}(T, W))}}``, for binary outcomes ``Y``. -where ``\hat{H}(T, W) = \frac{1(T=t)}{\hat{g}_n(W)}`` is known as the clever covariate. The value of ``\epsilon`` is obtained by minimizing the loss ``L`` associated with ``Q_Y``, that is the mean-squared error for continuous outcomes and negative log-likelihood for binary outcomes. This can easily be done via linear and logistic regression respectively. +where ``\hat{H}(T, W) = \frac{1(T=t)}{\hat{g}_n(W)}`` is known as the clever covariate. The value of ``\epsilon`` is obtained by minimizing the loss ``L`` associated with ``Q_Y``, that is the mean-squared error for continuous outcomes and negative log-likelihood for binary outcomes. This can easily be done via linear and logistic regression respectively, using the initial fit as off-set. !!! note - For the ATE and AIE, just like the gradient is linear in ``\Psi``, the clever covariate used to fluctuate the initial ``\hat{Q}_{n, Y}`` is as presented in [Average Treatment Effect and Average Interaction Effect?](@ref) + For the ATE and AIE, just like the gradient is linear in ``\Psi``, the clever covariate used to fluctuate the initial ``\hat{Q}_{n, Y}`` is as presented in [Average Treatment Effect and Average Interaction Effect](@ref) If we denote by ``\epsilon^*`` the value of ``\epsilon`` minimizing the loss, the TMLE is: @@ -286,7 +292,7 @@ Then, the CV-TMLE is: \end{aligned} ``` -Notice that, while ``\hat{\Psi}_{n, CV-TMLE}`` is not a plugin estimator anymore, it still respects the natural range of the parameter because it is an average of plugin estimators. Also, because ``\hat{Q}_{n,Y}^*`` is based on the entire data, it seems this estimator is still somehow double-dipping. +Notice that, while ``\hat{\Psi}_{n, CV-TMLE}`` is not a plugin estimator anymore, it still respects the natural range of the parameter because it is an average of plugin estimators. ## References diff --git a/docs/src/index.md b/docs/src/index.md index f3500c58..faa470c2 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -82,12 +82,12 @@ nothing # hide ## Scope and Distinguishing Features -The goal of this package is to provide an entry point for semi-parametric asymptotic unbiased and efficient estimation in Julia. The two main general estimators that are known to achieve these properties are the One-Step estimator and the Targeted Maximum-Likelihood estimator. 
Most of the current effort as been centered around estimands that are composite of the counterfactual mean. +The goal of this package is to provide an entry point for semi-parametric asymptotic unbiased and efficient estimation in Julia. The two main general estimators that are known to achieve these properties are the One-Step estimator and the Targeted Maximum-Likelihood estimator. Most of the current effort has been centered around estimands that are composite of the counterfactual mean. Distinguishing Features: - Estimands: Counterfactual Mean, Average Treatment Effect, Interactions, Any composition thereof -- Estimators: TMLE, One-Step +- Estimators: TMLE, One-Step, in both canonical and cross-validated versions. - Machine-Learning: Any [MLJ](https://alan-turing-institute.github.io/MLJ.jl/stable/) compatible model - Dataset: Any dataset respecting the [Tables](https://tables.juliadata.org/stable/) interface (e.g. [DataFrames.jl](https://dataframes.juliadata.org/stable/)) - Factorial Treatment Variables: diff --git a/test/counterfactual_mean_based/fluctuation.jl b/test/counterfactual_mean_based/fluctuation.jl index 4ea5461b..6f263c33 100644 --- a/test/counterfactual_mean_based/fluctuation.jl +++ b/test/counterfactual_mean_based/fluctuation.jl @@ -30,6 +30,7 @@ using DataFrames η̂ₙ = η̂(η, dataset, verbosity = 0) X = dataset[!, collect(η̂ₙ.outcome_mean.estimand.parents)] y = dataset[!, η̂ₙ.outcome_mean.estimand.outcome] + mse_initial = sum((TMLE.expected_value(η̂ₙ.outcome_mean, X) .- y).^2) ps_lowerbound = 1e-8 # Weighted fluctuation @@ -42,6 +43,9 @@ using DataFrames cache=true ) fitresult, cache, report = MLJBase.fit(fluctuation, 0, X, y) + fluctuation_mean = TMLE.expected_value(MLJBase.predict(fluctuation, fitresult, X)) + mse_fluct = sum((fluctuation_mean .- y).^2) + @test mse_fluct < mse_initial @test fitted_params(fitresult.one_dimensional_path).features == [:covariate] @test cache.weighted_covariate == expected_weights .* expected_covariate @test cache.training_expected_value isa AbstractVector From d3ba3499a3b3475a54d050233ccf5220aea7563a Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Mon, 29 Apr 2024 17:24:24 +0200 Subject: [PATCH 9/9] update project version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index fd175069..f2910362 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TMLE" uuid = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf" authors = ["Olivier Labayle"] -version = "0.16.0" +version = "0.16.1" [deps] AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"