From 49cd41f7f1fb6b0fb34fdc3a648dbe4db8cc92e8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 11:27:14 -0400 Subject: [PATCH 01/27] Bump jinja2 from 3.1.3 to 3.1.4 in /requirements (#1211) Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.3 to 3.1.4. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.3...3.1.4) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a051200b5..501edf345 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -2,7 +2,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4 ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 -jinja2==3.1.3 +jinja2==3.1.4 lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 numpy>=1.22.0 From d037756332ea226358314c489347ad677b363af1 Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Thu, 16 May 2024 10:20:44 -0400 Subject: [PATCH 02/27] Run document update again (#1216) * misc changes to neox_args * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions --- configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c8e1492ae..dd10a0e09 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 6fb840e + Default = 8d175ed current git hash of repository @@ -1201,7 +1201,7 @@ Text Generation arguments -- **num_experts**: int +- **moe_num_experts**: int Default = 1 @@ -1243,7 +1243,7 @@ Text Generation arguments - **moe_token_dropping**: bool - Default = True + Default = False Whether to drop tokens when exceeding capacity @@ -1273,6 +1273,47 @@ Text Generation arguments +- **moe_type**: str + + Default = megablocks + + Either `deepspeed` or `megablocks` + + + +- **moe_glu**: bool + + Default = False + + Use gated linear units in MoE + + + +- **moe_lbl_in_fp32**: bool + + Default = False + + Whether to compute the load balancing loss in fp32. + + + +- **moe_jitter_eps**: float + + Default = None + + Coefficient for MoE routing jitter. 
Jitter is + not used if set to None + + + +- **enable_expert_tensor_parallelism**: bool + + Default = False + + Enable expert tensor parallelism + + + ## NeoXArgsTokenizer Tokenizer Arguments From 153e732f7df9676d33d97f07f5e2009ae7b1b2a4 Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Tue, 21 May 2024 18:34:53 -0400 Subject: [PATCH 03/27] Rwkv pipeline parallelism (#1221) * misc changes to neox_args * allow rwkv pp * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- megatron/neox_arguments/arguments.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index dd10a0e09..48c03f15a 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 8d175ed + Default = 0d5992f current git hash of repository diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index ff4f4bc21..98a444ea4 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1070,8 +1070,8 @@ def calculate_derived(self): ), "Mamba does not yet have dropout implemented" if "rwkv" in self.attention_config: assert ( - not self.is_pipe_parallel and self.model_parallel_size == 1 - ), "RWKV not currently compatible with parallelism" + self.model_parallel_size == 1 + ), "RWKV not currently compatible with model parallelism" if isinstance(self.zero_stage, int): assert self.zero_stage <= 2, "Zero stage 3 not compatible with RWKV" assert ( From 2746d43ede314e74fd3bda818d6a044ac3c71b9b Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 21 May 2024 18:36:21 -0400 Subject: [PATCH 04/27] Add Torch Profiler Support (#1226) * format: flagged on pre-commit * feat: add pytorch profiling * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- README.md | 11 ++++++++++- configs/neox_arguments.md | 2 +- images/pytorch_profiling.png | Bin 0 -> 89473 bytes megatron/data/helpers.cpp | 12 ++++++------ megatron/training.py | 22 ++++++++++++++++++++++ 5 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 images/pytorch_profiling.png diff --git a/README.md b/README.md index e7f61bf20..e11122f5e 100644 --- a/README.md +++ b/README.md @@ -640,7 +640,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, # Profiling -We support profiling with Nsight Systems and PyTorch Memory Profiling. +We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling. ## Nsight Systems Profiling @@ -656,6 +656,15 @@ The generated output file can then by viewed with the Nsight Systems GUI: ![Alt text](images/nsight_profiling.png) +## PyTorch Profiling + +To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`. + +The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within +TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). + +![Alt text](images/pytorch_profiling.png) + ## PyTorch Memory Profiling To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`. 
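As a rough illustration of how the profiling options named above fit together (not part of this patch: the key names are taken from the README text, while the step numbers and output path are placeholder assumptions), a training config might enable them like this:

```yaml
# Sketch only: enabling the built-in PyTorch profiler plus memory profiling
# via the config options named in the README section above.
# Step values and the output directory are illustrative assumptions.
profile: true                                 # turn on the PyTorch profiler
profile_step_start: 10                        # first step to trace (placeholder)
profile_step_stop: 12                         # last step to trace (placeholder)

memory_profiling: true                        # also record memory profiling output
memory_profiling_path: "./memory_profiles"    # assumed output path
```

The profiler trace then lands in the `tensorboard` log directory, as described above, and can be viewed through TensorBoard.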
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 48c03f15a..1dbb4dd8a 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 0d5992f
+    Default = b68ba6d

     current git hash of repository

diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png
new file mode 100644
index 0000000000000000000000000000000000000000..e85324dc694d7c11e922a0cdc54f38d364f99d59
GIT binary patch
literal 89473
[binary PNG data omitted: new image images/pytorch_profiling.png, 89473 bytes]
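For reference, the MoE options documented in patch 02 above can be read together as one config stanza. The following is only a sketch: the option names and defaults come from the generated `configs/neox_arguments.md` entries, while the concrete values are illustrative assumptions, not settings used anywhere in this series.

```yaml
# Sketch only: MoE options from the patch-02 docs, with illustrative values.
moe_num_experts: 8                        # docs default = 1
moe_type: "megablocks"                    # docs: either `deepspeed` or `megablocks`
moe_token_dropping: false                 # docs default = False
moe_glu: false                            # gated linear units in MoE
moe_lbl_in_fp32: false                    # compute load-balancing loss in fp32
moe_jitter_eps: 0.1                       # routing jitter; docs: not used if None (0.1 is assumed)
enable_expert_tensor_parallelism: false   # expert tensor parallelism, docs default = False
```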
z`<%m`o?+x!@2ZdLjti4?T^I(rIS~+!yyU-vffrojB$0!~u#XH+72_mS^%9sQW5Txj zT^vjVVBL}67?RaFdwqfPL+X!>At2-L0rSnd^tub4{&~ckVh=w>U5?l6-3@{q_-a)Id&8nHE_)rSSN~)IyvY zDR~}rYE8R$%TZh5Pm0dO}It6^292n zG}OqysIHQSRh7bcp~Y2zM8dPv2CZLp8&o_I)>klSm>ZHe1y=slO*;K0HLWNh?b^(7 zSSfa_dtWB(oWMAwK;Nh~w^BvASeYEh z5n`Xb`R`;VWqd2fk&Ve7>M2>^u3L{6^D|Y)HL#N6uabI&kt%``lxc&kf-|yhSPKWFoE)wz{9P0|hM~H_K|#Z6g4e%?=uhss9t_yQ@BXJ1D2SBi_w9iLw~z0WS`nboI`kvUt)@^-?& zI0KgpN0}`iS(j{=ngqEW&b{ZJCCe)n;rt7k+WPc;O67$IoSD}i6>|idAuHSo0=G(W zCU2e0zXb%Eixh`24Q`Sc6tEDl2ON-szI``heYbfpw1`4Gswe7MV`IhJ-`nKPdE;Dh z(yzoleQY7U`OX^DcE7-a($o3mvCIE?pR=z?yTT@AA)ULa^F1&OH?Ko-xFn6w11Vh7 zxy)g8KlRy{gmoYG;|Ew~z^>N$b+^+rS90& zWK&9EaVul>$>IjcwEVRD3i}H{1xkPQ^kR-SMO>`hkGj3QE9=`2yFOo9-<_MN^J{Y> zA0zWfD6=Fm(7!c{*{2>O^@&XQiI2UH_#EOx{6Ml&FEU2%lio_|$z~G^Zo5|ecOTCo z7f|2&eb<;HpwZ0>E|u9{NGU1Ymlu_ZR&Th@z^6|n^o%`$59pvnq27mw>$%90o0;Py z%jFKopo~q-KG+{a<6jn-!b64*zrd}}%uwW%BHQwv>CN)0Jgc`UQ0{X;N~;5Kw#dZT z6mt4b!~)OUQcbcQ!|-M^nT6UEP5Rt3V&vvHv2C8f0{W*(xyZej_LgfFYw+^u%}my( z{Y8@Yk@b@Ke1+CO(|73ZxdpUG>5P(Utvim^A(I#SMSsPX0ULa z4gcVVPEef9jumw=m}PThRg_g(j)UA_e*uEo*>Ow&4BH-sSqCOJ*@o`}SnJ5PWH&2d z=x!3~NjEqw-1>6ORRR_CYc?Mdn1A^iu8E;uT~L(1(`NfcIb~pV#fJr2d{Il_(}lU} z7123tosSohLqQb_C}Vfq3$%Vpnl^Zm$8M2o$t2+HMP6nO4ya|Rxc#rb}otG(;= z=E^l^PjS%U(C4#l=F_ zx08(eq;_p@T1UCQs7~VUX=;x8;H619-+*np;?V5kZYl)O`~_-h-woEmugo)TE@lX^ zyj-G6y58upZ}Kox(!kF$gpyLNp(*5yoq<;6s^2yQ&fd-ngUQ{bu_u|7&G35Dvbi`V zLTw2nBPm7uzq43#ep$Is(Y?V{iBavWsxoVFIKm?<@%ZIguqMn(D zfG&%6MjZG^!IkFYh;uLO@6B|~_JDCm1JV-L{YPSC;*ea*0-K@pw#{LN)PS-Bg|U>; zi;HbSXa{gv;H|#t5pUq)VIgymzIHyeLa)!xtjKjjsaI}GQ=ya?4qxYLSzKUav&6)x(|b|1O+AUi&26(aGpM7*VUHfq7k`H?VzbW8&1GW@Sg;R7v>cSF z3okQvjFLYo7rKs*zAZ{hx*!!7W7{FI*2qKPcfDjjQFKr5h()e|Do8Edo(d2OLw%2V zb4xf!Lm+m>!kLU9W3UeTYV*q_Q*lz@HX8aH%m?-XyGOV4KQ#Gx2{~!JRF?Djk(-9| z+>x7TFFsbhbCKS1FKMyQ$^}6)uEwLW&*X++d8$>+8TpEf_!Ju_y5!GO!6(A@`>>Z$j*MrL3y!M5`tMG4r-BAZeWHAo%f;Eo z0^CJ4?#7R4>%$TWHvcNGUAl2aF}KCG>pf4r^M3Asntt+H{Ou(DP&CX)Wc#?qt=)CZ zOTcD1=b{8toR1XRXm`x^w)YOba-MMUV!8D>_l#3C@Hf%otrNOht*k7~8hhK<#*X0I zbAM?mKK~AV`jcAi`dIp4s$836pR3!!dV$+|G{G4$Er(}O=-|Xk4(L>9w0-G<#p~Nj zK?NxZ(staK4uUv`3NF%iH^E}Fh#vH z@eA~)wDFu+A6K$t98LXi){WAzHbL=b02RxbUo+%SpE(3gL)AX89g_ME=(qCBBoJSA zP&hd5x-o818!cGkxeCW%`R(fG|{Jmzo zZ0$~{B$20CCKXYp{9I0f^Y&jMUYP5q;RgxCgMLVb#e-CD!P`SDuwVm`n<&unx7Tae zIn9L(>3=HA%<~Dp0OdH47Y?*IoICRNA@>sL3vJQRG7K^uAF9~5q3+jz$ZqPn***#c z-EZT}+ik&-BD{~ucdvSp#sKP2&QGYwdn*(8o%6;c1vmvEC+n&x*njy&Q0&Uu?zM6D zaFWsPUr=Xb>Y{~IXXAL9D%s-wgr)Jp40LF1;$cr8nr)X^8^Sd;mg;@@S%=M@20X0} zj4`yhP+^dyFL1N@yiqXe3sfPBe6iBq=!NE{ep)rP_D$4@F+_)Sz$w&{LDa&H?z@)8 zIU;2dFJNA~4D{2Yv4G(u>#KLbX0&Zn1ru!|f|Auf%Z}f6u&A!`dt)e{E{P4wRl7Y= z`wEL-WYUSh>dbwj1e~|ob&VF`f_tv^yV(SsM}v{kYoLslcH~LN@mRN=r>aH-Y}EXM zF`s>WT5P}O-m>XX?)J`?UenEMB7Tjl1yM+JX0~+KZq~?SvsGar}SAS~}GUfFlRY?JtE4cc$gkY`%YX6yRKHy=%kA z$i|d!pW+4@W0Rpgc(`ugUeuQK-5;%?4f^N)*k`BfJTGNx8e$#Vn_KP2HNBQ zbJencbhpUOX>g05z ztT=D$v6^mRw9AGNQCGjsel|ljDY^?#(XJ}yHkU_}nk!e1#!OrdRl33cb#hzJxGJEL zZH70({0f|?!RwF96hkC`@($_fOg&ljPOjUVH-(E+4jP?%t%0d<(Zl6BwKO}V$FUm? 
z*>EAR&%&|uyzBKQ&tl~WlTIiD5)s;5#{6phJ(6{}f3}XGcMs#5bqL~zmrJPOWfxJ4 zW@H2r0KFYKPtvM?WsWH?R24GObpnEV`{*_)CCo<2PpRStSZBP%*sL}(Wv~LB$%FG^ zn97m@XYEY1t*X3V!S*ASv9HX|_OciEP;V~-NM8ImN^~a^B*hJoV=sDD)74~T%|A*K z@d3X7<#I-vC^1c%DZV>zzd=3C8Vr}1k8@>~$-tmR_7cL`NWLVEUv8+gY|p7-bnY)( za~)(y$p3wp7Xw-xb9~A1Ix4SB-WL2?gHc36%JvIBW7s!#z4Om&r*Xo5nbp~xLvKbb{#GfQpWmbJPXluj-!_N+Yse-qpz&ns_pE{DcB?gy zZUyH6^O+F$@%?JX*lO964%|tS7xF#Ki!Ay1ji~aO_hYOK?Y{u>>7xkQa+jv<6Dr?I z>N+E~!Pf^QAks(FeI?iw^dV2)&aVG@=VEV3noZr0gu(XDcekSc;y*6d|NOcs|3H|u zfvLl&X;tgI;MZ#kKw`8JL)~6pht@_WW*s@@668Mnq-YVpOrGFux+RfkANVr333Xh( z=^$hx0yErqvXWo5o8smrt3y)k)ipjW#tLP>42D23zX$>6D-5J`E=GWHlmhInK2{0M zH*gY&g6;M$7t#XCB3F!>5&x|z2CLrE4GHtXxoI2mO`#4+4?8~A^iYwl@>-3aolb=@ zNC~>SPP4=9#4EivwC_>DPx1XT@c8KXl$@BkD?aGk7>DE-M6S$8+*gxrPqow7`t2mR zk4Gt)?dkDqBP(_7yLqaQ)XNIrTWP@=_3IjrmoQp18w=6ifu|cj*pR06ph-i6`$fyU zDcdyes@1oH`RR2_&8O$NPtoPmoflKP-nY+#BIjgnk1P7YnPX$59LjCvv8bU+DHa3% zZ@~t-dQOQcF|y*1!*P<|;R;h-H_t8uaHQ}+?>0_q%vOuwzW`8oAW6t|&(T)$h; ze-!&Nz}Y6d4QV~%aOGHkTk1aQoM+-!ZNN1&>lx&*o%LVyUtdj3z=#-t#tESOuurBE zVVs>QF*6?RUKjo|&a4@%t#L4P;Q2Ic+6c4 zfvTcS6P`d8lrFD23i$py2bV8DalxTJ2Mm8q9g&wd*igF^mW*_@WHM|Mfw+0+$il2Z76 zfuQ;|7N&*}r=X%L?n#=C-CjWbbfi1bNOa~g0^~-f^l&*%eZUZY5y5z*TGda}C_pMV; zgh#ZMFX?iJeUx?g&A}Y(JBLli=423Tw06i7*5Pur9+#3cz&85KcHjo$JwIP>;IwXk zrFs96?djg>oipm_w6KNyv2})ccLqgiGw<rNZcCiRgQ?ay?3=;J}8 z<9MJExaTX)KLz|rvUcjoI#FHpt42BU9XZ67p<+8BP}+~z2kxY#8eQ(ndEma|F1Asa zyEXFBWf^_W9+?U!?g)Bl;IVy9(th-@yKX9@Sz=@$BU0!(+H)m*Ar0ZayobX3GNnkJ z=E5A0yz+AJRWV4{>@_m-Kz~2sb!sQ_#v?DbNO(j@!a5hR|%=ecjSuzDW+{CUlDvBO@}E7|ulJ<5}-IOe<=tppU- zCd(2NN!Bd*JgMtuE)?z%xO7D>!3(0ndK13?tXFB>BCsGk%s|M$4@Wp6Z8v>{M33d_ zt%f^%VUx8eTCRdNIi3CITub6%GEZ`@(z#~eC&7sM{@)O+=77}iq-iQrWc59 z4~GPH`~0^b(JjPF&3(mY%cFxF zT9A>(v?-Z>C0u=0tej5C!=UTfT!x)s$4t(313GDa*5%yD&|OBmNBZi^vxS@Un5akB zJ2GAFjNTLd8}6F3wB6ZtpaC>;r+Pe|V_}{jZILTIET4JtS; z6Ncf)Qs1B6mUh(HwB2ry6c%@`u0lxOG6bPQU&QxX^Nu|S&=K7X_q{gq2Nr~ac<281vcMkHN|N+t zc@uyNxU5JB?CJj~_J*09~Pqn+i# zKYl~Imv_4&9crgX6wbz(_3#QG>9>AkE=y8J=Qd>ps8A2S+$Kjwr|kDtSx446R~CN* zIw`*#o8Bu$GJAkus=dg#$%{-Vh-q;{*s@qpLlF`F;X%OM8l~hB_;ZCKS>tj$H8~w+ zBS6?BrD{vVr{(jS5=BX|$YgPLR|u#OyEF`^pBSc)5%i!8i71S*oe+O9g1(i|Df&aB-eR&$H? 
z`BzFvpAD8W6wnsMG;iKNhMc0*3Dm^O6I0~xqdE66`$ivq9PIDux$Qw;PbttM4cYBQar>*x3fWi7z^4=%E>J0e@4ssB_2Ps$lt_wkm z-=nw8(fQDMt&n{?Dcp*JEuC;-^EC<;n?g3*iPYl1CiY!%q7u3 zn;oYvvB$R?&P0~QVhO8#z-8qes_#0LedKx5|D4+6oHg7xXS}47mjOxeEXneE7Ca(4 z?b|glmZYyrvP#^-Q7I(9^D6)ip3xxB=K*cgmfKL{hFokL2)b&)Fmf?E=zj4dYGM|N zv%1KV`hH&wn0;?a;c0+=;E5_AOA22kZr9oja()9q+XN(afHf(|Z%vus?5p&c`71wp zbANZexOh(Z8d7!Cd3l^6tFsCxa<}=q4OWS1WJwGddxN&xh=6#!E=`h^sAJwI%VUAK zZKQdp+ys`DR%o{2j{9{x3ut@9uWw#&v0DW4Bu)Uk^+Y+ql1X^~pUBl-DKxtxvnsVW zT_0bm*zo4gepi1FTbtXNa)D00YV^j_-R|D2#`U+^AwSPZs+fK+P+Ir8`}-sZ;d<8g z<-o{Q`)P~CLHp|FMi{xacaNXe|HQ%F?#WuJox`Cqj8!Kg0q!}IWn;(2u7wSA=rif)0$;MfOUoo= zy_cKpv@hx^x7ht7tS!D+nZ~JgVy|AiwSIW;`r}BJL2*gA=5ymxbJ;~qjqCDf3NB%( z=fBSWvDXC4sl`on!#c8mG}$;iUTLnlRa#9{z`F_c?t{<{@($ribd)mUYfIwMWPM8@V9d^3QxmdClh!14a^Ab- zG=%4+=EQr1SBB#y<_#PvLY6Ov_gR*j{*w||DBtB0vDD{W24hh}4)**^IAyBf1UN}UiIy*)rY5oncC7|Kt%Mk<50HmM6x zjvto;bV%->y^G`GJ5?g8eSDnynovA^0aM0{N3({Sif@<^;umUFNUe-((@Y3GFl^N0 zC3|U(@Y|GPV|nt9rxN|oj1z@01)3Q=y*ezb(*qszJ^V~?Z#{8N7kLm&aI7*H6EFOE zY6Dw2+XSFG1Mkh>SKS_Sb2aBR+;!aGyvla_?iyF<%DT+hJD=K9&7;Gm+qOgHMXP_f zpB;nOj1TPkH$-vB@NH|QzE9n*1se71QCFNXhGO>1PNSVhE z>t)^m5cV}*4CNC@Ovhh}--DXx&3^4vHU*84hiBZ^sQ1UPRc!gby@+f0an5mxbQa8v z_2$Cal`3l&>++nox>yq1R4FU16UkcFtS!|41Yq5S*S|+n?zfbEeDjM1TTD@eZy#4b z1G!Mplz;H28fTCNhEP|{lGPH&Hd$SbwUO(it&itfy`b(j>!ZLe0B3l4tQ5HKtZ{UY z@J4Q)sZ5Li4LIPPbN;p2+Lx@w8r$b8G}^M^j*Ey|L7D<{6PYHR7WCZHno-d064$^4 zp2^mOgO*xj;8%VrXKAi5IehzyM~dTh7l6UX@x!^lD~Y;{B7sE*svw|0qpy7e&XHvt z$W4`HV$kaT2hsxU`d8I9-&63TaGyt{eak8Dtx{v+Ove{E{+u*`1h+pVfO#CRXpM<@ zZxit^JZL#CGKGd#vmDg669Und;Cn+MI5?25tc0lgKa~tuZJ(HAVR(^xO+>GWp`;a@DYYHxv|f=*AD(@PfkXQhpL83PdyJL#g%x~$h<>A7a9gq#`rk> zkLe2dDHexAq za8a&cb1INYz-kjK!yq$a=Ub@^7C*dX4)-{4%*YV{ z?VQ)IY-}HV!_Jpwil?;E>Q#NCMfkv$6cq zm~TlvC0_qx{cS;F5D2kpG38TUL;J6{fkXs1be3p!|2DVPvma)Qm0J3Bso>=cW?ad2 zk@{y^pn8b_H(h2C6=qrNml#w25Ub>`2?5f#h63Ce7($g~xVWh}ex-dDQM)PiUa@_r zo~Z!(^l!-2cpWtg4;$WqAVL#lgH?m9%&|#CfXV~Rr+9}Hd-!Im=Z{mWs8R~7v$LmD z*NdCLuB#h=SsKMjtFRg?Bd8Uq?>)X^`pb97PgAu2QTce1H)s1KwwaEGKTOinc>lzv z@Kie-sMpO-S9(oSt7~i@=Sbgg4aV-;Xgi_nQ+ulkj8VU9$o~ihbRg{S z+Zp~u^t7`EzAHd~@z4JTFt&Wz5jS=M+sWH*PMEGrO|dU?7v!c595Pz82tX{GQn>%S z!^OJSBC7AV=l%AI!NvJJxk#yUkyEJVcYQPOWe;1Z9gdnq!P#(@1)%~4P_FdPZ%-l_ zw6l$90{6m$_hhTjy!y*S#i1GsV;Q5D(&@X+I&Jcua#%c@{@CwB5qbdA;#l~I1<)FH zxw~&%6`K6OHc$93c3mP^c1WWS{P)kcw)$$4oY361vbTLd>u1Q#b+s^fv7!o+_*?x8 zd>TXA?}l_n4{O_1J_i}bHDYAQ2=H$rNX%@+5S8*HZXD1$zv)f*Bmb&%sFV6BBy3@| zwV9ulkzIB-0z_AcQS4Ul9HG$2qkFI{itM6>O3ncQ+?NLp0+l-U~Vm`8dXCUxGd+x?mTiMAJ4 zB39Pd=Oy?a8;9mhmm(vomDi^G3LwFup@3^MW#tR@^BX%u*JG3EdE?>NMK2kqHmTqCc*sN8sJ;6#9!hlg|oGm?eSt45A_{ruT|Ufp)HZU^5*xFABh z*l|3ukJ-~dAVj2VS)3BnsndU@OblK7@%W3R&rU|Mv0_uPQH9+p?=m98hFLfSlHNP+pw{)8x0O2`| znk6j8j63Et%RKSyec8FckP^H(SXp0vix5DSK6JYswG@ft7vdcVc1t@|VnoGFGx|NM zEY-VDl&*ySS}H93{r>@f|L?H%e<0`o2TE+Ja!yxWee8(7LjZu$(ZlRGx&*6{@rOM( zsj%riLX7>GtDf>TR8>_vxou-=PW7lx&+qi~#Y$3DAm%Nxk(qS~Y|7t%DEYbMFjY{T zP()nO#)N~+Ywx5`S_rzw+p4OkxCTz+`$ra0a5+wKWV)LBg6Z%6-d*w__vxiAD&?g;xdTo*cCS5p+JKy*?7PzNvzf5MYGa8j?-3rqKEJ@l;rQGju*|#6&_+amwu?oY=-9}ukxIEoPxVca8XG;6 z-- z!4uS$6JND2sQdN=t}XQSXej^OVX&%eO=Yaa;M1oeYk~1U%0B;sE|^^^t--t~wJ>h( zExN@0c{RzrV@)!x{+xq0Nl4tlMDOT&U)^Sf(JtFR$nx!~4^-m9@7k(n`lzJ!=3fc zVw&elSI}xw04q-bH(vzQ9vd#c zU6lRsxx350Q^<(YJ9<6aGDp&u2>U;zMh?)m<2$4UBWl%eeWvj|TEh*#fS|AGYgPxL zG>O1`#U@>T@=h|(%`dz&V=at~HZalZ^EAnp=|jVAV6x!(#+NFNmT;DV4@v8rZjL<4 zZB4OlKj{oSOu4v5mT`B@E}CaurxwY3HYbUKsd;r*J9-ba$-W9D(&{gX>AU=(-$wcx zDSka+zB1?IugOhnx*m?&Lgv?rW1^J~ab)pEP-T$6Y=m5cbbGf21cj@%>71JnE0|IS z-ZJhNo@O^!1}&sH*G$)#q~KTDrSg3@=kRgqDZCah`ww6Z#6K7qD?gCyS0$%9%s-!p 
zfr2lEQ61RnSWq2KBdX^c3KH-f8SbCDc&auH5h^7@721O^OQAiZT_frm_?MdGts#;e z3@RD{zJ@=>M7#ie7+V=?Ei(7X-RM_jx~Q=I81_l4J@)}U!gj1V)RbYQr6_N>kjdO$ z18IsDm#vbMynbj+?UuRgR7k7N?K->2>nbw9=?Vk8eYhUeswt3mt%IF;;k!T*tEb=x zwJ=rAfP4hSq$1Aq4&K^ksatN!|5H$N5wM~RaQhti}4Wk*Y&eDAxK+a!;XmGHj!-! z_BT`v_5|BL5?C2zF<9X?UUr}NiM!369~?tj<8}gptRFA(0-ZoXvoB*L!)<2VY6%Kb zu0*`FpVjhXtr*NTDvp#B>}GdQ4e$Miq-t6IOAi1~yAnnR%gg{KfmqrzwF$i#>9aro z!~W?J?O(D^1!&EzQ+Pv^p6ZI{S8rR4aX{;ftCG5$kN?)+WPfLve|35$c$ptx_529| z`TL^#Mh`QZ6izCXwmz#hYSNdYLxKD}SN6N62p99RQ&4BuC8)%vKmd39r_RpA92q=C zI_IkE+1aRH#NYbsjmriV+<;|is0_8@fsQADy7#3t5e9dQtj(mS*vtbxVgz6DJf}Th zx%vbB%q4h#@=VDKg8CHlt@!68JV$y8LWM~F&$D-T9j3w#om^aypGKeBw>rtH&r4j{ zA@k7o0J^AlC?s)`f9q8;So$+VvZJ(7s>ij&3c+s5Kkth`zDrRq=`h<<$WZYgJ2XEZ zJ_nAvn{0xTwY0>{tb{ojgATs_o4kVxj3C)Y|yJHc8yrCKH|FngMz^TYDNI!(}iw8f@GU3kA9VI#25}v5AV6p`+<$BY|L0W_p+L3pV@b zUvn=h^5av7UGoQ6Er+#XCE_ak>D}-z*>Ck>3jebP^v5zu0=WEt42M{_(f_FC8@528 zWX^wrh;Blf|DyH(PBbOLp5+?gXLlL+&v1kXna*1GsVaGYrrF!?)3R_4usgW~MZOT| zgV{Dw!98ZCIP$++mf}P?z1MdGE(P%!!?I1AnOwTDL5k2N=z5OItgwoJGn-#+>(93* zeevuQDaOu9XyVR)s?JZ2yB!OI&eCFKc0Iq#=h3lt@!W#g=kJgp6Fs3<sn7qQlUsl*mSF9l|$~Qmce^?~8yNBP3$QWEyeiBgo zvJ#U04di|(i7#}vTk;6gl=)yBo>KX%2h-d9Z~eCRd(Ho{^Su@7f}O=tFs|w^%YAL= zW1qRbUrhncu2i=bxc{&Au00&eZH>KxNIU}vr(?e7@~AxBvdlO6H{bdwtJXy zDP(7(Nijktw-JRfE{zDYHSWnZ*TI+=3>jvIne(-e_Bs1`&L8`nzt11*pYK`E_pNWO z_g(M%uJwC=FPCHUdF$?~o+m?4j^W>W%%+(YVlZfTqucTafHLCV=9Wjt_3i47zOTTI zUTx-)-23EpkyLp)Me;&*sG`4nY$KP*ZKWwSK+$xo{6pwggY_SeRL{-y9*`F}y1O72 zOrj8Az~U6sw@U&@H7CE@9#2SlsBlP)xcst}158v4r`BrGIdZTHM z*0;x6f=1Towx~FWb@;A>5tjwk%&pH-?A^)QgzoSIOSrH*{<;g3ka6OcSQVWzb?5-t zgIEpqd|j4Fv*)I>+yYWrM(#crlPuuFhjp1FYHaq1Dx0l=G36b8*A zi*ki4g@_7m#VNVch3zg#&bpv(BV~A0<50i;*rW5w^CCh}tKY8pwj>EWHfa`c#4w0& zLxIQTYB&$!e5L05xxE{VqYFFiCHuR5tFB#1^QCUr6km;KCB zXd_*U#`l0aaKr{|=@PQQL@VvMTBgF1f05?kU035PMuZgXiZqZ#k*#0xpo0M8mpCxC zKPspv$O!%NTQ14(;0D~KiHq+y*pn@92F%>v#uZ&qS4@a8tK5`gkq@A%*&-5%lzil_ zDlGwGRpOmnz?2fi0FUVe+4 z{W}2P?zqfZGvTS*Z}stBk%XBU_sH{7Ua>Bhb~_AZKq(F2CD`uklG4y<3JQ^Fcd75v z%{>_oki%P~10}H$u4GRG=bqGrJ>{B}o9>^`NIChdZo(>LE%IE|`WdWdgUYT+PiGEV zK|#{{W^6Utq$DNf*%w(XY^*ksQVsLkiUfSZ_;}jLOd zZ5AfF7)7dA9<}dOX?jKNGph;|sevG)eW|$*y=B(>nRR`#U#2Hrw62(<_|>u_=UDok zo8IfjWzchIvo5qNO3YD+1WvXI{5M!7w4un2+D-8Lu7{UvsytfOR$V4Rbuc%RStj@t zIorViwM~E1`ZIu?Pm9>8C?wywf5ixz9y7MOaZcU+0m@zLyZHSnsQ>=TKdCmql+d@i zgA)}DQ_BnOb5<@vww<9M^N&dg>NXGqL~cb_Cdk>xk)QaH3Jbve6^YO? 
z=-n5_sVf5x;OrShHKhAsgH8!rM}&Bx5@Om^Y#xV;tvJ5M^FdlRCC+~H z`nYsTsfP(akF(_|0FHk)TVtq`79y-j?4xTu=0p?rWMjC}>9ndtxO%;|Twc+$qf>RD zXdz{hm3{upPrH+}x@ICDTBmfC@4rCqobjo2$vj-_bpPhmhm!s8`E2neD6ATI*?qJH z|IY6i{FG|m(sFQyfz$`V2fRjxWZ%W+l2Etgwl{DIeIXbxRFyAUnkRoVZY*c9!s~gX zgj(TrMcDOAS++U~zE-K~Yl^t{i-Ngu?pJzSacYu7Nrm?JNc^a(q}WCyME>i5gg_Uw6WDSaK2M&4;a`zV3?Yt-b*g4+trl!#Zgm5 z`-Y}StLB@bMUoHuP5bm`gIaBQ^vWVs+zFc*TA$fvf2bR_u*{Sh=Vz9fb0@p?&A`)G z*Km5NTKMx@W68v8kE0@vHvc=kPWK4K$=HtOfJxe6LF`-dRY+JHLt!m6L`e@_OZYv{svseQtx4Cdi!!v zX7mnaN_nx{deFj(Wm{t~%E6+Vx=GtL#RMVPG<(vDU&r$K<`HWQd-p{%^h zW(%lc1&U2Id*LAqU|6Y@2cb^1g{4S&Y4y}tk?;MfhV9ue;vfZT6b)ByC7$oLMhF$q z=_m~qccF1OCDk%Ckadat+BlSEXC)8N-sZhNHM@36*2^PO?^=izhptJ6}5 zLeyHw@*c80B+wV^HZek6Rmk=AzxAbaoyrDZcYQS_x{2o&uQW$Hcel@&P^M&heZvD> z131@)7$NTA#^g*kbK{jA#B}Nt74b51(f5Z2t>LNPb~rl}6ac_M<&{m)%8IDus`EXZ z+MTRY5NJnYtmsv3Ju4&N2OTCF@8f$F4N>{EnI>aAF(xBTV-55+GB6r03`2%ps!Xt5 z*)v8`MOYHf2jb-X2_N8FcXbHASTsA>Aga&l>6wJEfq(}Te0OczeLI3~& literal 0 HcmV?d00001 diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index aca290854..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/training.py b/megatron/training.py index 6a4e843ab..3265680c5 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -970,7 +970,28 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) + + if neox_args.profile: + schedule = torch.profiler.schedule( + wait=neox_args.profile_step_start, + warmup=1, + active=neox_args.profile_step_stop - neox_args.profile_step_start, + ) + prof = torch.profiler.profile( + schedule=schedule, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + neox_args.tensorboard_dir + ), + record_shapes=True, + profile_memory=True, + with_flops=True, + with_modules=True, + with_stack=True, + ) + prof.start() while iteration < neox_args.train_iters: + if neox_args.profile: + prof.step() if neox_args.profile and iteration == neox_args.profile_step_start: torch.cuda.cudart().cudaProfilerStart() loss_dict, skipped_iter = train_step( @@ -983,6 +1004,7 @@ def train( ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() + prof.stop() iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": From 1d557086119553f261ea53bb5c3e4ecf4d2ec5e3 Mon Sep 17 00:00:00 2001 From: Lang Xu <59843980+R0n12@users.noreply.github.com> Date: Tue, 21 May 2024 18:37:57 -0400 Subject: [PATCH 05/27] fixed fused_rope naming in JIT + added readme for amd support (#1224) --- README.md | 11 ++++++++++- 
megatron/fused_kernels/__init__.py | 6 +++--- tests/model/test_fused_kernels.py | 4 +--- 3 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md index e11122f5e..e63a59f28 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,6 @@ To install the remaining basic dependencies, run: pip install -r requirements/requirements.txt pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard -python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels ``` from the repository root. @@ -106,6 +105,16 @@ from the repository root. +### Fused Kernels +We now support AMD GPUs (MI100, MI250X) through JIT fused-kernel compilation. Fused kernels will be built and loaded as needed. To avoid waiting during job launching, you can also do the following for a manual pre-build: + +```python +python +from megatron.fused_kernels import load +load() +``` +This will automatically adapt the build process to different GPU vendors (AMD, NVIDIA) without platform-specific code changes. To further test fused kernels using `pytest`, use `pytest tests/model/test_fused_kernels.py` + ### Flash Attention To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 1e4c9efac..3694e964b 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -135,8 +135,8 @@ def _cpp_extention_load_helper( srcpath / "fused_rotary_positional_embedding.cpp", srcpath / "fused_rotary_positional_embedding_cuda.cu", ] - fused_rotary_positional_embedding_cuda = _cpp_extention_load_helper( - "fused_rotary_positional_embedding_cuda", + fused_rotary_positional_embedding = _cpp_extention_load_helper( + "fused_rotary_positional_embedding", sources, extra_cuda_flags, extra_include_paths, @@ -174,7 +174,7 @@ def load_fused_kernels(): print(e) print("=" * 100) print( - f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them" + f"ERROR: Fused kernels configured but not properly installed.
Please run `from megatron.fused_kernels import load()` then `load()` to load them correctly" ) print("=" * 100) exit() diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py index cc458bf4a..125eb6c52 100644 --- a/tests/model/test_fused_kernels.py +++ b/tests/model/test_fused_kernels.py @@ -30,9 +30,7 @@ ) -@pytest.mark.xfail( - reason="ModuleNotFoundError: No module named 'scaled_masked_softmax_cuda'" -) +@pytest.mark.xfail(reason="SystemExit: None") def test_load_fused_kernels(): load() try: From d3d59f2f1ff9b104ff024965b7519d757c955c40 Mon Sep 17 00:00:00 2001 From: yang <7129+yang@users.noreply.github.com> Date: Tue, 21 May 2024 15:39:08 -0700 Subject: [PATCH 06/27] Small tidying (#1222) * Tolerate no fused kernels * Fix requirements file syntax * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: Yang Zhang Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- megatron/model/norms.py | 7 +++++-- requirements/requirements.txt | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 1dbb4dd8a..c6d369524 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = b68ba6d + Default = 1b85a2f current git hash of repository diff --git a/megatron/model/norms.py b/megatron/model/norms.py index 8b06b177c..dda44659f 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -14,7 +14,6 @@ import torch from torch.nn import LayerNorm as LayerNorm -from .fused_layer_norm import MixedFusedLayerNorm def get_norm(neox_args): @@ -23,7 +22,11 @@ def get_norm(neox_args): eps = neox_args.rms_norm_epsilon elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon - norm = MixedFusedLayerNorm if neox_args.layernorm_fusion else LayerNorm + if neox_args.layernorm_fusion: + from .fused_layer_norm import MixedFusedLayerNorm + norm = MixedFusedLayerNorm + else: + norm = LayerNorm elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 501edf345..3ac92598a 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ -git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed +deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed ftfy>=6.0.1 -git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 +lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.4 lm_eval>=0.4.0,<=0.4.1 From dfc6722f2ab0e3efb65ce5b49449a2a8b14a26b7 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Sun, 26 May 2024 17:46:02 -0400 Subject: [PATCH 07/27] Fix markdown formatting error (#1217) * Update README.md * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/README.md | 2 +- configs/neox_arguments.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/README.md b/configs/README.md index d8ae81739..e14274b56 100644 --- a/configs/README.md +++ b/configs/README.md @@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter 
GPT model. For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) -Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. +Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. ```yaml # GPT-3 pretraining setup { diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c6d369524..306a0da5d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 1b85a2f + Default = a3fb470 current git hash of repository From b5c0afe42851dc4878aaa1b56021dc71b3ee435d Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:22:14 -0400 Subject: [PATCH 08/27] add workflow_dispatch to gh actions pr so we can run on command (#1233) * add workflow_dispatch to gh actions pr so we can run on command * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions --- .github/workflows/pull_request.yml | 2 +- configs/neox_arguments.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 3213718df..a2b1a2fc2 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,6 +1,6 @@ name: Pull Request -on: [pull_request] +on: [pull_request, workflow_dispatch] jobs: pre-commit: diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 306a0da5d..c60d1e15f 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = a3fb470 + Default = 516169c current git hash of repository From 4a34e0a565f19a8578210654afcb3bb835fcc35e Mon Sep 17 00:00:00 2001 From: jaimemcc <99298642+jaimemcc-intel@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:26:54 -0700 Subject: [PATCH 09/27] init changes to README (#1232) * init changes to README * Update NeoXArgs docs automatically * Update README.md * Update NeoXArgs docs automatically * Update README.md * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- tests/README.md | 77 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c60d1e15f..f6c3ecde3 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 516169c + Default = 7aa0074 current git hash of repository diff --git a/tests/README.md b/tests/README.md index 316096cc5..390a52898 100644 --- a/tests/README.md +++ b/tests/README.md @@ -32,7 +32,7 @@ pytest --forked tests/model/test_model_generation.py Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu. The test cases for cpu can be run with: -```` +``` pytest tests -m cpu ``` @@ -49,3 +49,78 @@ if You see this kind of error: RuntimeError: Cannot re-initialize CUDA in forked subprocess. 
To use CUDA with multiprocessing, you must use the 'spawn' start method ``` It usually means that you used some pytorch.cuda function before the test creates the processes. However just importing `from torch.utils import cpp_extension` can also trigger this. + + +## CPU Test Integration + +Tests can be run against physical CPUs through GitHub Actions. To have tests run on the physical CPU test, here is generally how the CI should be written: + +### runs-on + +The CI needs to be written to target the CPU Github Action runner. The jobs that need to run on CPU should use the hardware runner's labels: +```yaml +jobs: + cpu-test-job: + runs-on: [ 'self-hosted', 'aws', 'test'] # these labels tell GitHub to execute on the runner with the 'aws' and 'test' labels +``` + +### Software dependencies + +Hardware tests that need python and docker should install them as part of the test execution to make sure the tests run as expected: +```yaml +steps: + # sample syntax to setup python with pip + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + cache: "pip" + + # sample setup of docker (there's no official Docker setup action) + - name: Docker setup + run: | # taken from Docker's installation page: https://docs.docker.com/engine/install/ubuntu/ + # Add Docker's official GPG key: + sudo apt-get update + sudo apt-get install ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + # Add the repository to Apt sources: + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y +``` + +Any other software dependencies should be assumed to be missing and installed as part of the CI. + +### Using Docker image + +Using the Docker image and running tests in a container is recommended to resolve environment issues. There is a modified docker-compose.yml in tests/cpu_tests directory that is recommended to be used for CPU tests: + +```bash +cp tests/cpu_tests/docker-compose.yml . +# export any env variables here that should be used: +export NEOX_DATA_PATH='./data/enwik8' +docker compose run -d --build --name $CONTAINER gpt-neox tail -f /dev/null +# then can set up and run tests in the container using docker exec +docker exec $CONTAINER pip install -r /workspace/requirements-dev.txt +# etc. +# please clean up the container as part of the CI: +docker rm $CONTAINER +``` + +At the time of writing there is no built-in method to provide an offline-built Docker image to `jobs..container`. 
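(Illustrative aside, not part of the patch above: a minimal sketch of the "# etc." step, i.e. actually invoking the CPU-marked tests inside that container. It assumes the repository is mounted at `/workspace` and that `$CONTAINER` is the container started by the `docker compose run` command shown earlier; the exact pytest invocation is an assumption based on the `pytest tests -m cpu` usage in this README, not something defined by the workflow itself.)

```bash
# run only the tests marked with @pytest.mark.cpu inside the running container
docker exec $CONTAINER python -m pytest /workspace/tests -m cpu
# then clean up the container as shown in the snippet above
```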
+ +### Using existing CPU test CI + +There is an existing CPU test workflow that can be included in existing CI: + +```yaml +steps: + - name: Run CPU Tests + uses: + target_test_ref: $GITHUB_REF # replace with the ref/SHA that the tests should be run on + # have a look at the reusable workflow here: https://github.com/EleutherAI/gpt-neox/blob/main/tests/cpu_tests/action.yml +``` From 90a6cdb35f11d3a1892da4cb242c6a2576bcfb6a Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:24:49 -0500 Subject: [PATCH 10/27] fix summed biases not being divided by mp size (#1220) --- tools/ckpts/convert_hf_to_sequential.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/ckpts/convert_hf_to_sequential.py b/tools/ckpts/convert_hf_to_sequential.py index c53f28391..55cfc6517 100644 --- a/tools/ckpts/convert_hf_to_sequential.py +++ b/tools/ckpts/convert_hf_to_sequential.py @@ -119,16 +119,27 @@ def shard_sequential_mp(num_mp_ranks, sequential): ranks = {x: dict() for x in range(num_mp_ranks)} for k, v in sequential.items(): if reduce( + np.logical_or, + [ + x in k + for x in [ + "dense_4h_to_h.bias", + "attention.dense.bias", + ] + ], + ): + # Divide by tp_size since they get added together + for x in range(num_mp_ranks): + ranks[x][k] = v / num_mp_ranks + elif reduce( np.logical_or, [ x in k for x in [ "layernorm", "rotary_emb", - "dense_4h_to_h.bias", "norm.weight", "norm.bias", - "attention.dense.bias", ] ], ): From 2382bd4a6bfd0ec7199e1b7876cd8c457029e8e1 Mon Sep 17 00:00:00 2001 From: yang <7129+yang@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:27:01 -0700 Subject: [PATCH 11/27] Fix changed behavior of pipe_parallel (#1219) * Fix changed behavior of pipe_parallel * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: Yang Zhang Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- megatron/neox_arguments/arguments.py | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index f6c3ecde3..7a56e361e 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 7aa0074 + Default = 8451671 current git hash of repository diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 98a444ea4..9cad02c43 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -180,7 +180,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) config_files = dict() # iterate of all to be loaded yaml files for conf_file_name in paths_to_yml_files: - # load file with open(conf_file_name) as conf_file: conf = yaml.load(conf_file, Loader=yaml.FullLoader) @@ -477,7 +476,6 @@ def get_extra_deepspeed_args(self): return extra_ds_args def get_deepspeed_main_args(self): - args_list = list() if self.autotuning_run is not None: @@ -796,14 +794,11 @@ def calculate_batch_parameters( # either none of the three parameters are provided or just gradient_accumulation_step is provided else: - assert ( - False - ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" + assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" return int(train_batch), int(micro_batch), int(grad_acc) @staticmethod 
def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): - assert ( train_batch > 0 ), f"Train batch size: {train_batch} has to be greater than 0" @@ -1033,10 +1028,7 @@ def calculate_derived(self): # Update 'is pipe parallel' flag # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs - self.update_value( - "is_pipe_parallel", - self.pipe_parallel_size > 1 and self.moe_num_experts == 1, - ) + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) if self.moe_num_experts > 1: assert not ( self.is_pipe_parallel or self.pipe_parallel_size > 1 @@ -1106,8 +1098,8 @@ def calculate_derived(self): if "flash" in self.attention_config: _flash_version = packaging.version.Version(version("flash-attn")) if self.sliding_window_width is not None: - assert _flash_version >= packaging.version.Version( - "2.3.0" + assert ( + _flash_version >= packaging.version.Version("2.3.0") ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention." if self.pos_emb == "alibi": if not _flash_version >= packaging.version.Version("2.4.0.post1"): @@ -1234,7 +1226,6 @@ def validate_values(self): # Parameters sharing does not work with torch DDP. if (self.num_unique_layers is not None) and (self.num_layers is not None): - if not (self.num_unique_layers <= self.num_layers): error_message = ( self.__class__.__name__ From 4c426da8b6149e2313bc6e00584531f004cfe457 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Thu, 6 Jun 2024 21:37:48 -0400 Subject: [PATCH 12/27] Conversion script bugfixes (#1218) * update is_pipe_parallel logic ; handle tied-embeddings case correctly * Update NeoXArgs docs automatically * revert PP to be consistent * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- tools/ckpts/convert_neox_to_hf.py | 65 ++++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 7a56e361e..c884afd97 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 8451671 + Default = 714b299 current git hash of repository diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py index 35812383e..f4e0ccf9f 100644 --- a/tools/ckpts/convert_neox_to_hf.py +++ b/tools/ckpts/convert_neox_to_hf.py @@ -580,30 +580,59 @@ def convert( # Load output embedding if not sequential: - loaded_tp_ranks = load_partitions( - input_checkpoint_path, - mp_partitions, - get_key(loaded_config, "num-layers") + 4, - sequential=sequential, - ) + if get_key(loaded_config, "no-weight-tying", False): + # if we have trained input + output embedding layers without tied weights + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ) + else: + # in this case, output embedding layer and input embedding layer are tied. + # load + save the input embed weights into the output embedding layer's place. 
+ loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=0, + sequential=sequential, + ) # output embedding / LM head if architecture == "neox": # name of lm head / final linear proj varies lm_head = hf_model.embed_out else: lm_head = hf_model.lm_head - lm_head.load_state_dict( - { - "weight": torch.cat( - get_state( - loaded_tp_ranks, - "final_linear.weight", - layer_idx=get_key(loaded_config, "num-layers") + 4, - sequential=sequential, + + if get_key(loaded_config, "no-weight-tying", False): + # save the (untied) final linear into LM head for HF + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "final_linear.weight", + layer_idx=get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ), + dim=0, ), - dim=0, - ), - } - ) + } + ) + else: + # embedding layers are tied. transpose input layer and save + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "word_embeddings.weight", + layer_idx=0, + sequential=sequential, + ), + dim=0, + ), + } + ) del loaded_tp_ranks From 2608972a4957553bf6556044c8faf0bc28bcdafc Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:57:53 -0400 Subject: [PATCH 13/27] fix python version and pytest install (#1234) * fix python version and pytest install * Update NeoXArgs docs automatically * python3 * Update NeoXArgs docs automatically * pip not pip3 * Update NeoXArgs docs automatically * python3 pip * Update NeoXArgs docs automatically * python3 -m pip * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * add docker setup to workflow * Update NeoXArgs docs automatically * python setup * Update NeoXArgs docs automatically * python setup v2 * Update NeoXArgs docs automatically * python setup v3 * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Add hash back to deep speed version * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- .github/workflows/pull_request.yml | 14 ++++++++++++-- configs/neox_arguments.md | 2 +- requirements/requirements.txt | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index a2b1a2fc2..99f7f988d 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -9,7 +9,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: "3.10.14" cache: "pip" cache-dependency-path: "**/requirements*.txt" # Need the right version of clang-format @@ -43,7 +43,17 @@ jobs: runs-on: self-hosted steps: - uses: 
actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: "3.10.13" + cache-dependency-path: "**/requirements*.txt" - name: prepare data - run: python prepare_data.py + run: python3 prepare_data.py + - name: install pytest + run: python3 -m pip install pytest pytest-forked pyyaml requests wandb + - name: install torch + run: python3 -m pip install torch + - name: install requirements + run: pip install -r requirements/requirements.txt - name: Run Tests run: pytest --forked tests diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c884afd97..1e67685ed 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 714b299 + Default = 455446c current git hash of repository diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3ac92598a..b5a84674b 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,11 +1,11 @@ deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed ftfy>=6.0.1 -lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.4 +lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 -numpy>=1.22.0 +numpy<2.0 pybind11>=2.6.2 regex sentencepiece From 0e5f6db140819d80cd480d54c63cdc1aa5b818e3 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:30:02 -0500 Subject: [PATCH 14/27] Add a chat data preprocessing script (#1239) * Add a chat data preprocessing script * add EOT at end of a chat * update README.md * apply pre-commit --------- Co-authored-by: Quentin Anthony --- .github/workflows/pull_request.yml | 2 +- megatron/data/helpers.cpp | 12 +- megatron/model/norms.py | 1 + megatron/neox_arguments/arguments.py | 8 +- tools/datasets/README.md | 51 +++ .../preprocess_data_with_chat_template.py | 348 ++++++++++++++++++ 6 files changed, 412 insertions(+), 10 deletions(-) create mode 100644 tools/datasets/preprocess_data_with_chat_template.py diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 99f7f988d..53be528ae 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -52,7 +52,7 @@ jobs: - name: install pytest run: python3 -m pip install pytest pytest-forked pyyaml requests wandb - name: install torch - run: python3 -m pip install torch + run: python3 -m pip install torch - name: install requirements run: pip install -r requirements/requirements.txt - name: Run Tests diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... 
- } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/norms.py b/megatron/model/norms.py index dda44659f..19e1aeae6 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -24,6 +24,7 @@ def get_norm(neox_args): eps = neox_args.layernorm_epsilon if neox_args.layernorm_fusion: from .fused_layer_norm import MixedFusedLayerNorm + norm = MixedFusedLayerNorm else: norm = LayerNorm diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 9cad02c43..054689eda 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -794,7 +794,9 @@ def calculate_batch_parameters( # either none of the three parameters are provided or just gradient_accumulation_step is provided else: - assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" + assert ( + False + ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" return int(train_batch), int(micro_batch), int(grad_acc) @staticmethod @@ -1098,8 +1100,8 @@ def calculate_derived(self): if "flash" in self.attention_config: _flash_version = packaging.version.Version(version("flash-attn")) if self.sliding_window_width is not None: - assert ( - _flash_version >= packaging.version.Version("2.3.0") + assert _flash_version >= packaging.version.Version( + "2.3.0" ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention." if self.pos_emb == "alibi": if not _flash_version >= packaging.version.Version("2.4.0.post1"): diff --git a/tools/datasets/README.md b/tools/datasets/README.md index f8215959c..af3009a23 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -93,6 +93,57 @@ output data: --dataset-impl {lazy,cached,mmap} Dataset implementation to use. Default: mmap +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates +``` +## `preprocess_data_with_chat_template.py` +Similar, but uses huggingface's [chat templates](https://huggingface.co/docs/transformers/main/en/chat_templating) to +tokenize the data to support multiturn and more complicated use cases. + +N.B. If using this, you **must** specify your data when training/finetuning with the following configs +```json +"train_data_paths": ["train_documents"], +"test_data_paths": ["test_documents"], +"valid_data_paths": ["test_documents"], +"label_data_paths": ["label_documents"] +``` + +the `"data_path"` option will not work with `"label_data_paths"`. + + +``` +usage: preprocess_data_with_chat_template.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--no-mask] + [--generation-role GENERATION_ROLE] [--only-last] [--num-docs NUM_DOCS] + --tokenizer-path TOKENIZER_PATH [--ftfy] --output-prefix OUTPUT_PREFIX + [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] + [--log-interval LOG_INTERVAL] + +options: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. 
Default: text + --no-mask If set, this will not mask any tokens in the input data. + --generation-role GENERATION_ROLE + The role of the model generating the chat, usually 'assistant'. Default: assistant + --only-last If set, this will mask everything except the last turn in the chat. + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-path TOKENIZER_PATH + Path to HF Tokenizer. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. Default: mmap + runtime: --workers WORKERS Number of worker processes to launch --log-interval LOG_INTERVAL diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py new file mode 100644 index 000000000..81770deff --- /dev/null +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -0,0 +1,348 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A script for processing a dataset such that chat templates are utilized in the creation of the data. +These are then used to perform instruction/chat model finetunes (for example, finetuning a model on only the assistant +portions of a chatml dataset). + +This follows the same output format as 'preprocess_data_with_mask.py' but using chat templates to generate the data. +This way we can support multiturn chat data in the finetuning process. instead of relying on a single turn of data. + +To run this script, first edit `tools/datasets/corpora.py` such that the command to call + `tools/datasets/preprocess_data_with_chat_template.py` is as follows: + +``` +cmd = f"python tools/datasets/preprocess_data_with_with_chat_template.py \ + --input {jsonl_filepath} \ + --output-prefix {parent_folder}/{self.name} \ + --tokenizer-path {hf-tokenizer} \ + --jsonl-keys {jsonl_keys} \ + --dataset-impl mmap \ + --workers {self.num_workers} " + +if self.only_last: + cmd += f"--only-last " + +if self.no_mask: + cmd += f"--no-mask " +``` + +Then, specify +``` +"train_data_paths": ["/path/to/dataset/name_text_document"], +"label_data_paths": ["/path/to/dataset/name_label_document"] +``` +in your YML config. This will then allow for finetuning on the data with loss masks set appropriately. 
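For reference, a single illustrative input record is shown below (one JSON object per line). The `conversation` key matches this script's default `--jsonl-keys` value, and each turn uses the `role`/`content` fields expected by Hugging Face chat templates; the message text itself is made up:

```
{"conversation": [{"role": "user", "content": "What does pipeline parallelism mean?"}, {"role": "assistant", "content": "It splits a model's layers across devices so different stages run in parallel."}]}
```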
+ +""" + +import argparse +import multiprocessing +import os +import sys + +import lm_dataformat as lmd +import numpy as np + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) + +import time +import tqdm +import jsonlines + +from megatron.data import indexed_dataset +from threading import Semaphore +from typing import List, Dict, Tuple +from transformers import AutoTokenizer, PreTrainedTokenizer + + +def build_chat( + chat: List[Dict[str, str]], + generation_role: str, + apply_mask: bool, + tokenizer: PreTrainedTokenizer, + only_last_turn: bool = False, +) -> Tuple[List[int], List[int]]: + """ + Build a chat from a list of dictionaries. Each dictionary should have a "role" and "content" key, this follows the + Chat Template from https://huggingface.co/docs/transformers/main/en/chat_templating + + :param chat: A list of dictionaries with "role" and "content" keys + :param generation_role: The role of the model generating the chat, usually "assistant" + :param apply_mask: Whether to apply a loss mask to the chat, if False, all tokens will be included in the loss + :param tokenizer: A HF tokenizer + :param only_last_turn: Whether to only include the last turn in the chat, needed for some fine-tuning tasks + """ + tokens = [] + mask = [] + if apply_mask is False: + tokens = tokenizer.apply_chat_template(chat) + mask = tokens + return tokens, mask + for i, turn in enumerate(chat): + add_gen = ( + False if i == len(chat) - 1 else chat[i + 1]["role"] == generation_role + ) + chat_tokens = tokenizer.apply_chat_template( + chat[: i + 1], add_generation_prompt=add_gen + ) + # remove previous stuff... + tokens.extend(chat_tokens) + if only_last_turn and (i != len(chat) - 1): + mask.extend([-100] * len(chat_tokens)) + elif apply_mask and (turn["role"] != generation_role): + mask.extend([-100] * len(chat_tokens)) + else: + mask.extend(chat_tokens) + if tokenizer.eos_token_id is not None: + mask.append(tokenizer.eos_token_id if mask[-1] != -100 else -100) + tokens.append(tokenizer.eos_token_id) + return tokens, mask + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_path) + + def encode(self, text): + ids = {} + for key in self.args.jsonl_keys: + text_ids, label_ids = build_chat( + text[key], + self.args.generation_role, + not self.args.no_mask, + Encoder.tokenizer, + self.args.only_last, + ) + ids[key] = (text_ids, label_ids) + return ids, len(text) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " + "list", + ) + group.add_argument( + "--jsonl-keys", + nargs="+", + default=["conversation"], + help="space separate listed of keys to extract from jsonl. Default: text", + ) + group.add_argument( + "--no-mask", + help="If set, this will not mask any tokens in the input data.", + action="store_true", + ) + group.add_argument( + "--generation-role", + type=str, + default="assistant", + help="The role of the model generating the chat, usually 'assistant'. 
Default: assistant", + ) + group.add_argument( + "--only-last", + help="If set, this will mask everything except the last turn in the chat.", + action="store_true", + ) + group.add_argument( + "--num-docs", + default=None, + help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", + type=int, + ) + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-path", + type=str, + required=True, + help="Path to HF Tokenizer.", + ) + group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + group.add_argument( + "--dataset-impl", + type=str, + default="mmap", + choices=["lazy", "cached", "mmap"], + help="Dataset implementation to use. Default: mmap", + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument( + "--log-interval", + type=int, + default=100, + help="Interval between progress updates", + ) + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.model_parallel_size = 1 + + return args + + +def yield_from_files(fnames: list, semaphore): + """ + Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / + other compressed formats. Also filters out empty documents. + + :param fnames: list of filenames + """ + + def yielder(fname, semaphore): + with open(fname, encoding="utf-8") as f: + reader = jsonlines.Reader(f) + for f in reader: + semaphore.acquire() + yield f + + for fname in fnames: + semaphore.acquire() + + yield from yielder(fname, semaphore) + + +def main(): + args = get_args() + encoder = Encoder(args) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and + # hence building up memory + semaphore = Semaphore(10000 + args.workers) + + # use multiprocessing to iterate over input documents + fin = yield_from_files(args.input.split(","), semaphore) + + if args.workers > 1: + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + else: + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) + + # make a dataset builder for each key in args.jsonl_keys + # each key will output to a different file beginning with args.output_prefix + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.jsonl_keys: + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[key]._dtype = np.int32 + if not args.no_mask: + assert ( + key + "_label" not in args.jsonl_keys + ), "label should not be included as it will be generated according to the mask." 
+ key += "_label" + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[key]._dtype = np.int32 + + # actually do tokenization + proc_start = time.time() + total_bytes_processed = 0 + pbar = tqdm.tqdm() + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + + # release semaphore so `yield_from_files` can add another file to the buffer + semaphore.release() + + # add each tokenized document / sentence + for key, conv in doc.items(): + tokens = conv[0] + token_mask = conv[1] + builders[key].add_item(np.array(tokens, dtype=builders[key].dtype)) + builders[key + "_label"].add_item( + np.array(token_mask, dtype=builders[key + "_label"].dtype) + ) + # add indx... + builders[key].end_document() + builders[key + "_label"].end_document() + if i == 1: + print("key: ", key) + print("tokens: ", tokens) + print("token_mask: ", token_mask) + # log progress + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + pbar.set_description( + f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)." + ) + if i != 0: + pbar.update(args.log_interval) + + # save output file + update_keys = args.jsonl_keys + for key in update_keys: + builders[key].finalize(output_idx_files[key]) + builders[key + "_label"].finalize(output_idx_files[key + "_label"]) + + +if __name__ == "__main__": + main() From 1cee5b7c7074302de4867ad5cac3f1ea26f7a7d7 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Fri, 28 Jun 2024 11:21:46 -0500 Subject: [PATCH 15/27] Fix paper reference in init_functions.py (#1241) --- megatron/model/init_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 86a003dbd..8a0b8e251 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -145,7 +145,7 @@ def init_(tensor, use_mup=use_mup_outer): def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving - the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" + the Normalization of Self-Attention - Nguyen, T. & Salazar, J. 
(2019), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) def init_(tensor, use_mup=use_mup_outer): From c1ea2a1ff1c062ed89ede27780cecf0122ae3f42 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Mon, 5 Aug 2024 23:14:27 -0500 Subject: [PATCH 16/27] Add hf llama to neox conversion (#1247) * - Add conversion of HF llama models to NeoX * - Add conversion of HF llama models to NeoX * - minor fix * pre-commit --------- Co-authored-by: Quentin Anthony --- tools/ckpts/README.md | 17 ++ tools/ckpts/convert_hf_llama_to_neox.py | 219 ++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 tools/ckpts/convert_hf_llama_to_neox.py diff --git a/tools/ckpts/README.md b/tools/ckpts/README.md index 24d5cf31c..770cfb9c6 100644 --- a/tools/ckpts/README.md +++ b/tools/ckpts/README.md @@ -131,3 +131,20 @@ options: --num_output_shards NUM_OUTPUT_SHARDS --pipeline_parallel Only use if PP>1 ``` + +### `convert_hf_llama_to_neox.py` +Takes an HF Llama checkpoint and puts it into a NeoX-compatible format. + +Note that this does not support pipeline parallelism! + +``` +usage: convert_hf_llama_to_neox.py [-h] [--tp TP] [--pp PP] [--model MODEL] [--model_path MODEL_PATH] + +options: + -h, --help show this help message and exit + --tp TP Number of tensor parallelism ranks + --pp PP Number of pipeline parallelism stages + --model MODEL HF model name + --model_path MODEL_PATH + Path to save model +``` diff --git a/tools/ckpts/convert_hf_llama_to_neox.py b/tools/ckpts/convert_hf_llama_to_neox.py new file mode 100644 index 000000000..2adddb19d --- /dev/null +++ b/tools/ckpts/convert_hf_llama_to_neox.py @@ -0,0 +1,219 @@ +import torch +import argparse +from transformers import AutoTokenizer, AutoModelForCausalLM +import os +import tqdm + + +def convert_model(hf_state_dict, hf_config, tp_ranks): + conv_state_dicts = [{} for _ in range(tp_ranks)] + # get embeddings... + for i, chunk in enumerate( + torch.chunk(hf_state_dict["model.embed_tokens.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + "sequential.0.word_embeddings.weight" + ] = chunk.clone().detach() + print( + "model.embed_tokens.weight", + hf_state_dict["model.embed_tokens.weight"].shape, + "sequential.0.word_embeddings.weight", + conv_state_dicts[0]["sequential.0.word_embeddings.weight"].shape, + ) + # Get config data... + num_kv_heads = hf_config.num_key_value_heads + num_q_heads = hf_config.num_attention_heads + head_dim = hf_config.hidden_size // num_q_heads + # do layers... + for layer_num in tqdm.tqdm(range(model.model.config.num_hidden_layers)): + # --- attention --- + # Output first since it's a simple row parallel... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.dense.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.self_attn.o_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.dense.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.dense.weight" + ].shape, + ) + # Now for attention... + # Split into heads... 
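# (Summary of the next block, added for clarity; not part of the diff.)
# q, k and v are reshaped so their leading dimension is num_q_heads, matching
# how the GQA attention code splits heads; that dimension is then chunked
# across tensor-parallel ranks, and each rank's q/k/v chunks are concatenated
# into the fused "attention.query_key_value.weight" layout that NeoX expects.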
+ q = hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"] + k = hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"] + v = hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"] + # The GQA code splits the heads by the num_q_heads so we also do that + # here to ensure it matches... + q = q.view(num_q_heads, -1, q.shape[-1]) + k = k.view(num_q_heads, -1, q.shape[-1]) + v = v.view(num_q_heads, -1, q.shape[-1]) + # Chunk for tensor parallelism... + for i, q_chunk, k_chunk, v_chunk in zip( + range(tp_ranks), + torch.chunk(q, tp_ranks, dim=0), + torch.chunk(k, tp_ranks, dim=0), + torch.chunk(v, tp_ranks, dim=0), + ): + # Need to join the heads across q, k, v... + conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ] = ( + torch.cat([q_chunk, k_chunk, v_chunk], dim=1) + .view(-1, q.shape[-1]) + .clone() + .detach() + ) + print( + f"model.layers.{layer_num}.self_attn.(q/k/v)_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.query_key_value.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ].shape, + ) + # --- mlp --- + # Do SwiGLU weights... + # w1... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"], + tp_ranks, + dim=0, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.w1.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.gate_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w1.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.w1.weight"].shape, + ) + # w3... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"], + tp_ranks, + dim=0, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.w3.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.up_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w3.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.w3.weight"].shape, + ) + # w2 (output)... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.w2.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.down_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w2.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.w2.weight"].shape, + ) + # --- norm --- + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{layer_num+2}.input_layernorm.scale"] = ( + hf_state_dict[f"model.layers.{layer_num}.input_layernorm.weight"] + .clone() + .detach() + ) + conv_state_dicts[i][ + f"sequential.{layer_num+2}.post_attention_layernorm.scale" + ] = ( + hf_state_dict[ + f"model.layers.{layer_num}.post_attention_layernorm.weight" + ] + .clone() + .detach() + ) + + # Get final ln/linear.... 
+ index = model.model.config.num_hidden_layers + 3 + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{index}.norm.scale"] = ( + hf_state_dict["model.norm.weight"].clone().detach() + ) + index += 1 + # do output... + for i, chunk in enumerate( + torch.chunk(hf_state_dict["lm_head.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + f"sequential.{index}.final_linear.weight" + ] = chunk.clone().detach() + print( + "lm_head.weight", + hf_state_dict["lm_head.weight"].shape, + f"sequential.{index}.final_linear.weight", + conv_state_dicts[0][f"sequential.{index}.final_linear.weight"].shape, + ) + return conv_state_dicts + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tp", type=int, default=1, help="Number of tensor parallelism ranks" + ) + parser.add_argument( + "--pp", type=int, default=0, help="Number of pipeline parallelism stages" + ) + parser.add_argument("--model", type=str, default="gpt2", help="HF model name") + parser.add_argument( + "--model_path", type=str, default=None, help="Path to save model" + ) + args = parser.parse_args() + assert args.pp == 0, "Pipeline parallelism not supported yet" + tokenizer = AutoTokenizer.from_pretrained(args.model).save_pretrained( + args.model_path + "/tokenizer" + ) + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype="auto") + state_dict = model.state_dict() + for key in state_dict.keys(): + print(key, state_dict[key].shape) + os.makedirs(args.model_path, exist_ok=True) + # Setup model directory... + os.makedirs(f"{args.model_path}/0", exist_ok=True) + # Save the latest file so neox can figure out where to grab the weights... + with open(f"{args.model_path}/latest", "w") as f: + f.write("0") + # Convert the model... + tp_state_dicts = convert_model(state_dict, model.model.config, args.tp) + for i in range(args.tp): + torch.save( + { + "dp_world_size": 1, + "mp_world_size": args.tp, + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + "module": tp_state_dicts[i], + }, + f"{args.model_path}/0/mp_rank_{i:02d}_model_states.pt", + ) From 0ef2c074ac03c2b888e9003e7ce4c166cb78cc82 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:26:15 -0500 Subject: [PATCH 17/27] bugfix: chat turns instead of repeating the conversation in preprocess_data_with_chat_template.py (#1258) * bugfix: chat turns instead of repeating the conversation * pre-commit --- tools/datasets/preprocess_data_with_chat_template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py index 81770deff..55623b303 100644 --- a/tools/datasets/preprocess_data_with_chat_template.py +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -104,7 +104,7 @@ def build_chat( ) chat_tokens = tokenizer.apply_chat_template( chat[: i + 1], add_generation_prompt=add_gen - ) + )[len(tokens) :] # remove previous stuff... 
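# Illustration of the fix above (comment only, not part of the hunk):
# tokenizer.apply_chat_template(chat[: i + 1]) re-renders the conversation
# from the first turn on every iteration. Without the "[len(tokens) :]" slice,
# iteration i appends turns 0..i again, so a three-turn chat ends up encoded
# roughly as: turn0 | turn0 turn1 | turn0 turn1 turn2. Slicing off the first
# len(tokens) ids keeps only the newly rendered turn, giving turn0 | turn1 | turn2.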
tokens.extend(chat_tokens) if only_last_turn and (i != len(chat) - 1): From f8c9e68c4984a0b6f7f5f276b563d2612a6dce9f Mon Sep 17 00:00:00 2001 From: jaimemcc <99298642+jaimemcc-intel@users.noreply.github.com> Date: Thu, 15 Aug 2024 14:57:02 -0700 Subject: [PATCH 18/27] Conversion for CI from self-hosted hardware (#1245) * changing from self-hosted runners to Github's ubuntu-22.04 runner environment * adding warning about not using 'self-hosted' runner labels and using Github runners instead * updated some guidance in comments for coverity scan CI * moving CPU tests to workflow_dispatch only --- .github/workflows/{cpu_ci_on_pr.yml => .cpu_ci_on_pr.yml} | 4 +++- .github/workflows/coverity_scan.yml | 5 +++-- .github/workflows/cpu_ci.yml | 2 +- .github/workflows/cpu_ci_dispatch.yml | 2 +- .github/workflows/pull_request.yml | 5 +++-- tests/README.md | 2 ++ 6 files changed, 13 insertions(+), 7 deletions(-) rename .github/workflows/{cpu_ci_on_pr.yml => .cpu_ci_on_pr.yml} (58%) diff --git a/.github/workflows/cpu_ci_on_pr.yml b/.github/workflows/.cpu_ci_on_pr.yml similarity index 58% rename from .github/workflows/cpu_ci_on_pr.yml rename to .github/workflows/.cpu_ci_on_pr.yml index 971640c18..43ce025c0 100644 --- a/.github/workflows/cpu_ci_on_pr.yml +++ b/.github/workflows/.cpu_ci_on_pr.yml @@ -1,3 +1,5 @@ +# This file is hidden (.cpu_cpi_on_pr.yml) to minimize the number of runner minutes consumed. + name: "Pull Request CPU Tests" on: @@ -7,7 +9,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04 but 24.04 is in beta - recommend testing on 24.04 and then changing instead of using ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/coverity_scan.yml b/.github/workflows/coverity_scan.yml index a79d0d8fb..128d279cc 100644 --- a/.github/workflows/coverity_scan.yml +++ b/.github/workflows/coverity_scan.yml @@ -17,9 +17,10 @@ jobs: runs-on: ubuntu-latest env: - COV_USER: ${{ secrets.COV_USER }} + COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }} - COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} + COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard: + # https://scan.coverity.com/projects/?tab=project_settings steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/cpu_ci.yml b/.github/workflows/cpu_ci.yml index 9160fccab..6910b8a1c 100644 --- a/.github/workflows/cpu_ci.yml +++ b/.github/workflows/cpu_ci.yml @@ -5,7 +5,7 @@ on: "push" jobs: run-tests: #runs-on: ubuntu-latest - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/cpu_ci_dispatch.yml b/.github/workflows/cpu_ci_dispatch.yml index b1d108b3b..38485d6a6 100644 --- a/.github/workflows/cpu_ci_dispatch.yml +++ b/.github/workflows/cpu_ci_dispatch.yml @@ -10,7 +10,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 53be528ae..7b06256bf 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,6 +1,7 @@ name: Pull Request -on: [pull_request, workflow_dispatch] +#on: [pull_request, workflow_dispatch] +on: workflow_dispatch jobs: pre-commit: @@ -40,7 +41,7 
@@ jobs: git commit -m "Update NeoXArgs docs automatically" git push run-tests: - runs-on: self-hosted + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 diff --git a/tests/README.md b/tests/README.md index 390a52898..f5ba5e560 100644 --- a/tests/README.md +++ b/tests/README.md @@ -57,6 +57,8 @@ Tests can be run against physical CPUs through GitHub Actions. To have tests run ### runs-on +#### NOTE: These BKMs were written to work with CI infrastructure that is no longer in place. To use the Github runners (ubuntu-22.04 / ubuntu-latest), skip the 'runs-on' section. + The CI needs to be written to target the CPU Github Action runner. The jobs that need to run on CPU should use the hardware runner's labels: ```yaml jobs: From 8b43196fbd832b797be9f3d88d54481171010507 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 23 Aug 2024 14:02:59 -0400 Subject: [PATCH 19/27] Megatron-LM style Sequence Parallel (#1257) * first draft (shape errors occurring) * training works (but poor convergence) * debugging progress: current commit works if we do regular TP via impl-ing AR in rowparallel as RS then AG * Update NeoXArgs docs automatically * push most recent code (updated mark_norms fn, back to 'real' sequence parallel) * Update NeoXArgs docs automatically * Fix LayerNorm all reduce gradient hook * Sum instead of average for LayerNorm gradient all reduce * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Fix gather and reduce scatter ops on sequence dimension * Fix sequence parallel with tied weight embeddings * Update NeoXArgs docs automatically * cleanup pass + add MoE arguments.py guard * pre-commit and clean up comments * remove vestigial debug code * remove unused debugging code * remove dummy test config * update fp32_allreduce to handle fp16 ; don't cast to fp32 for gathers * run linter on the rest of the files * Improve performance of sequence parallel gather, scatter, and reduce * Add comment * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Brandon Yang Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 12 +- megatron/model/__init__.py | 5 +- megatron/model/gpt2_model.py | 5 +- megatron/model/transformer.py | 29 ++++- megatron/model/utils.py | 56 ++++++-- megatron/model/word_embeddings.py | 10 ++ megatron/mpu/__init__.py | 3 + megatron/mpu/layers.py | 39 +++++- megatron/mpu/mappings.py | 187 +++++++++++++++++++++++++-- megatron/mpu/utils.py | 22 ++++ megatron/neox_arguments/arguments.py | 4 + megatron/neox_arguments/neox_args.py | 7 + megatron/training.py | 3 + 13 files changed, 349 insertions(+), 33 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 1e67685ed..413138597 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 455446c + Default = 53d0ae8 current git hash of repository @@ -1056,6 +1056,16 @@ Parallelism Arguments +- **sequence_parallel**: bool + + Default = False + + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. 
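    As a rough standalone sketch of what this means for activations (illustration only, not the code added by this patch; the real implementation is in megatron/mpu/mappings.py below): tensors of shape [s, b, h] are split along the sequence dimension across the model-parallel group, and gathered back, with a reduce-scatter in the backward pass, wherever a layer needs the full sequence.

```
# Toy sketch with 2 "ranks"; the real comms use torch.distributed collectives.
import torch

world_size = 2
x = torch.randn(8, 4, 16)                   # [seq, batch, hidden]

# scatter: each rank keeps only its slice of the sequence dimension
shards = torch.chunk(x, world_size, dim=0)  # two [4, 4, 16] shards

# gather: reassemble the full sequence where a layer needs it
full = torch.cat(shards, dim=0)
assert torch.equal(full, x)
```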
+ **Set by user, in contrast to neox_args.is_pipe_parallel.** + + + - **expert_interval**: int Default = 2 diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 619b4c33d..23be28936 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -16,5 +16,8 @@ # limitations under the License. from .gpt2_model import GPT2ModelPipe -from .utils import get_params_for_weight_decay_optimization +from .utils import ( + get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, +) from .word_embeddings import SoftEmbedding diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 9e643874a..7899048db 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -308,7 +308,10 @@ def _logits_helper(embedding, lm_output): ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, + embedding.word_embeddings_weight, + self.parallel_output, + seq_parallel=self.neox_args.sequence_parallel, ) return logits diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index c154b09f4..62e7d3a9c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -254,6 +254,7 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here + seq_dim=1, # important: must mark that this layer receives shape [b, s, h] not [s, b, h] and so Seq. Parallel comms must gather along dim=1 rather than dim=0 ) # else: @@ -1024,7 +1025,14 @@ def __init__( self.moe_type = neox_args.moe_type if self.gpt_j_residual: - self.reduce = mpu.mappings.reduce_from_model_parallel_region + # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers. + # the reduction we use is a simple allreduce for pure Tensor Parallel, + # but needs to be a reduce-scatter when using Megatron-style Sequence Parallel (LN sharding.) + self.reduce = ( + mpu.mappings.reduce_from_model_parallel_region + if not neox_args.sequence_parallel + else mpu.mappings.reduce_scatter_to_sequence_parallel_region + ) # Self attention. self.attention = ParallelSelfAttention( @@ -1339,10 +1347,25 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits( + input_, + word_embeddings_weight, + parallel_output, + seq_parallel=False, + seq_dim=1, + bias=None, +): """LM logits using word embedding weights.""" # Parallel logits. - input_parallel = mpu.copy_to_model_parallel_region(input_) + if seq_parallel: + # if using Sequence Parallelism, our logits are sharded along the sequence dimension. + # gather them here. (backward pass: reduce-scatter) + input_parallel = mpu.gather_from_sequence_parallel_region( + input_, seq_dim=seq_dim + ) + else: + # Set up backprop all-reduce. + input_parallel = mpu.copy_to_model_parallel_region(input_) # Matrix multiply. 
if bias is None: diff --git a/megatron/model/utils.py b/megatron/model/utils.py index c3da2ce8b..97b409c1d 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,8 +18,8 @@ """Utilities for models.""" import torch -from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes +from megatron import mpu from types import GeneratorType import torch.distributed as dist @@ -35,15 +35,9 @@ def get_params_for_weight_decay_optimization(module, neox_args): "name": "no_weight_decay_params", } for module_ in module.modules(): - if any( - [ - isinstance(module_, LayerNorm), - isinstance(module_, RMSNorm), - isinstance(module_, ScaleNorm), - ] - ) or ( - neox_args.weight_decay == 0.0 - ): # also include all parameters here if no weight decay is being done + # apply weight decay to any "...Norm" modules. + if "norm" in type(module_).__name__.lower() or neox_args.weight_decay == 0.0: + # also include all parameters here if no weight decay is being done no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) @@ -359,3 +353,45 @@ def get_fusion_type(neox_args): elif neox_args.scaled_masked_softmax_fusion: fusion_type = SoftmaxFusionTypes.general return fusion_type + + +def reduce_weight_grads_from_model_parallel_region(input_): + """A hook that can be applied to any weight tensor via .register_hook(). + Allreduces grads for e.g. LN weights across the model parallel group. + Needed to keep LNs in sync, despite them getting diff data -> diff gradients when using sequence parallel. + """ + # Bypass the function if no TP -> no comm needed. + if mpu.get_model_parallel_world_size() == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.float() + + # All-reduce. + torch.distributed.all_reduce(input_, group=mpu.get_model_parallel_group()) + + # Bf16 convert + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.bfloat16() + + return input_ + + +def mark_norms_for_sequence_parallel_grad_sync(module, neox_args): + """Iterate through the modules in our model, and for any "...Norm" classnames, + register a hook on each of that module's parameters which will allreduce norms' weights' grads across + the model (sequence) parallel region. 
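    A tiny self-contained check of why this all-reduce is a sum rather than a mean (illustration only; the toy loss and shapes are made up): under sequence parallelism each rank computes the norm weight's gradient from only its shard of tokens, and the full-batch gradient is recovered by summing the per-shard gradients.

```
# Toy check: the sum of per-shard gradients equals the full-batch gradient.
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)                        # 4 tokens, hidden size 8

w_full = torch.ones(8, requires_grad=True)
(x * w_full).sum().backward()                # toy loss over all tokens

shard_grads = []
for shard in torch.chunk(x, 2, dim=0):       # two "sequence-parallel" shards
    w = torch.ones(8, requires_grad=True)
    (shard * w).sum().backward()
    shard_grads.append(w.grad)

assert torch.allclose(w_full.grad, shard_grads[0] + shard_grads[1])
```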
+ """ + + if not neox_args.sequence_parallel: + # if we aren't using sequence parallelism, this is a no-op + return + + for module_ in module.modules(): + if "norm" in type(module_).__name__.lower(): + # this is a norm, we want to allreduce its weight grads across sequence parallel region + for name, param in module_.named_parameters(): + if param.requires_grad: + param.register_hook(reduce_weight_grads_from_model_parallel_region) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index f7372bc55..ce3c1117e 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,6 +50,11 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes + + self.sequence_parallel = ( + neox_args.sequence_parallel + ) # if we are using sequence parallelism, then we'll want to scatter our inputs across the seqlen dim across TP ranks + self.use_mup = neox_args.use_mup self.mup_embedding_mult = neox_args.mup_embedding_mult self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult @@ -159,6 +164,11 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): with torch.no_grad(): embeddings.mul_(self.mup_embedding_mult) + if self.sequence_parallel: + # TODO: megatron-lm does dropout using the scattered embs. This would save a tiny bit of time, perhaps? + # Not a priority since we don't often use dropout + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + return embeddings diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index 2365507d9..780fb33e8 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -47,6 +47,9 @@ from .mappings import gather_from_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region +from .mappings import scatter_to_sequence_parallel_region from .random import checkpoint from .random import get_cuda_rng_tracker diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 0d14806ac..d59edab94 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -33,6 +33,8 @@ from .mappings import gather_from_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region from .random import get_cuda_rng_tracker from .utils import divide from .utils import VocabUtility @@ -416,6 +418,7 @@ def __init__( MOE=False, MoE_mp_size=1, mup_rescale_parameters=False, + seq_dim=0, # Dimension which is the seq_len dimension. final ParallelLinear overrides this to be 1 ; otherwise, the default is used throughout. 
): super(ColumnParallelLinear, self).__init__() @@ -427,6 +430,10 @@ def __init__( world_size = MoE_mp_size if MOE else get_model_parallel_world_size() self.output_size_per_partition = divide(output_size, world_size) self.skip_bias_add = skip_bias_add + + self.sequence_parallel = neox_args.sequence_parallel + self.seq_dim = seq_dim + self.init_method = init_method self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters @@ -551,14 +558,29 @@ def set_parallel_output(self, value: bool): def forward(self, input_): if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() - # Set up backprop all-reduce. - input_parallel = copy_to_model_parallel_region(input_) + + if self.sequence_parallel: + input_parallel = input_ + else: + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. + if self.sequence_parallel: + # do an AG in the fwd pass, RS in bwd pass. + # gather / scatter portion happens across the sequence dim (self.seq_dim)-- + # almost always is [s, b, h] and so dim 0, but for lm_head ParallelLinear it is seq_dim=1 and [b, s, h] + input_parallel = gather_from_sequence_parallel_region( + input_parallel, seq_dim=self.seq_dim + ) + bias = self.bias if not self.skip_bias_add else None output_parallel = F.linear(input_parallel, self.weight, bias) if self.gather_output: # All-gather across the partitions. + assert ( + not self.sequence_parallel + ), "sequence_parallel=True and gather_output=True are incompatible!" output = gather_from_model_parallel_region(output_parallel) else: output = output_parallel @@ -623,6 +645,12 @@ def __init__( self.input_size_per_partition = divide(input_size, world_size) self.skip_bias_add = skip_bias_add self.parallel_output = parallel_output + + self.sequence_parallel = neox_args.sequence_parallel + assert not ( + self.sequence_parallel and not self.input_is_parallel + ), "Cannot have self.input_is_parallel=False and self.sequence_parallel=True." + self.init_method = init_method self.stride = stride self.keep_master_weight_for_test = keep_master_weight_for_test @@ -748,7 +776,12 @@ def forward(self, input_): # Matrix multiply. output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. - if not self.parallel_output: + if self.sequence_parallel and not self.parallel_output: + # do an RS in the fwd pass, AG in bwd pass. + # skip in the gpt-j parallel sublayer case (self.parallel_output=True) + # (user responsible for calling reduce-scatter) + output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + elif not self.parallel_output: output_ = reduce_from_model_parallel_region(output_parallel) else: output_ = output_parallel diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py index 535fe6255..f11d9e6ab 100644 --- a/megatron/mpu/mappings.py +++ b/megatron/mpu/mappings.py @@ -23,7 +23,7 @@ get_model_parallel_rank, get_fp32_allreduce, ) -from .utils import split_tensor_along_last_dim +from .utils import split_tensor_along_last_dim, split_tensor_along_any_dim def _reduce(input_): @@ -33,17 +33,17 @@ def _reduce(input_): if get_model_parallel_world_size() == 1: return input_ - # Bf16 convert + # upcast to fp32 if using fp32 allreduce dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): + if get_fp32_allreduce(): input_ = input_.float() # All-reduce. 
torch.distributed.all_reduce(input_, group=get_model_parallel_group()) - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.bfloat16() + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + input_ = input_.to(dt) return input_ @@ -75,11 +75,6 @@ def _gather(input_): if world_size == 1: return input_ - # Bf16 convert - dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.float() - # Size and dimension. last_dim = input_.dim() - 1 rank = get_model_parallel_rank() @@ -91,9 +86,100 @@ def _gather(input_): # Note: torch.cat already creates a contiguous tensor. output = torch.cat(tensor_list, dim=last_dim).contiguous() - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - output = output.bfloat16() + return output + + +def _reduce_scatter_along_seq_dim(input_, seq_dim): + """Reduce-scatter the input tensor across model parallel group, scattering across sequence dim.""" + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # upcast to fp32 if using fp32 allreduce + dt = input_.dtype + if get_fp32_allreduce(): + input_ = input_.float() + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + assert dim_size[seq_dim] % world_size == 0 + + if seq_dim == 0: + # reduce_scatter_tensor is faster but only works correctly on dimension 0 + dim_size[seq_dim] = dim_size[seq_dim] // world_size + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + tensor_list = list( + torch.split(input_, input_.shape[seq_dim] // world_size, seq_dim) + ) + output = torch.empty_like(tensor_list[0]) + torch.distributed.reduce_scatter(output, tensor_list) + + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + output = output.to(dt) + + return output + + +def _gather_along_seq_dim(input_, seq_dim): + """Gather tensors and concatinate along the (manually-specified) sequence dimension.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + dim_size[seq_dim] = dim_size[seq_dim] * world_size + + if seq_dim == 0: + # reduce_gather_tensor is faster but only works correctly on dimension 0 + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + input_ = input_.contiguous() + rank = get_model_parallel_rank() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather( + tensor_list, input_, group=get_model_parallel_group() + ) + output = torch.cat(tensor_list, dim=seq_dim) + + return output + + +def _split_along_seq_dim(input_, seq_dim): + """Split the tensor along the sequence dimension (as manually selected) and keep the + corresponding slice.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + # Split along second dimension. + input_list = split_tensor_along_any_dim(input_, world_size, seq_dim) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_model_parallel_rank() + output = input_list[rank].contiguous() return output @@ -162,6 +248,65 @@ def backward(ctx, grad_output): return _split(grad_output) +class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): + """Reduce-Scatter across sequence parallel region (same as model parallel region.) + Note: same region as model parallel region + """ + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _gather_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _GatherFromSequenceParallelRegion(torch.autograd.Function): + """All-Gather across sequence parallel region (same region as model parallel region.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _reduce_scatter_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _ScatterToSequenceParallelRegion(torch.autograd.Function): + """Scatter (split) sequence length across sequence parallel region (=> same region as model parallel.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return ( + _gather_along_seq_dim(grad_output, seq_dim=seq_dim), + None, + ) + + # ----------------- # Helper functions. # ----------------- @@ -181,3 +326,17 @@ def scatter_to_model_parallel_region(input_): def gather_from_model_parallel_region(input_): return _GatherFromModelParallelRegion.apply(input_) + + +def reduce_scatter_to_sequence_parallel_region(input_, seq_dim=0): + return _ReduceScatterToSequenceParallelRegion.apply(input_, seq_dim) + + +def gather_from_sequence_parallel_region(input_, seq_dim=0): + return _GatherFromSequenceParallelRegion.apply(input_, seq_dim) + + +def scatter_to_sequence_parallel_region( + input_, seq_dim=1 +): # use this fn in scattering input embeds across TP ranks. There, shape of inps is [b, s, h] instead of the usual [s, b, h] + return _ScatterToSequenceParallelRegion.apply(input_, seq_dim) diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index 13941dc29..1f97e0e76 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -53,6 +53,28 @@ def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks= return tensor_list +def split_tensor_along_any_dim( + tensor, num_partitions, seq_dim, contiguous_split_chunks=False +): + """Split a tensor along a user-specified dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + seq_dim: dimension along which to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. 
+ """ + # Get the size and dimension. + seq_dim_size = divide(tensor.size()[seq_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, seq_dim_size, dim=seq_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + class VocabUtility: """Split the vocabulary into `world_size` chunks amd return the first and last index of the vocabulary belonging to the `rank` diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 054689eda..6a84df6c7 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1038,6 +1038,10 @@ def calculate_derived(self): assert self.zero_optimization["stage"] != 3, "MoE not compatible with zero3" assert self.mlp_type == "regular", "MoE not compatible with LLaMA" + assert ( + self.sequence_parallel is False + ), "MoE not compatible with Sequence Parallel" + # Attention config if self.attention_config is None: self.update_value("attention_config", [[["global"], self.num_layers]]) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index febefb3c2..7993f785f 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -85,6 +85,13 @@ class NeoXArgsParallelism(NeoXArgsTemplate): according to pipeline parallel size. """ + sequence_parallel: bool = False + """ + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. + **Set by user, in contrast to neox_args.is_pipe_parallel.** + """ + expert_interval: int = 2 """ Have one MoE layer every expert_interval layers diff --git a/megatron/training.py b/megatron/training.py index 3265680c5..ce59b242a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -43,6 +43,7 @@ GPT2ModelPipe, SoftEmbedding, get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, ) from megatron.checkpointing import load_checkpoint, save_checkpoint from megatron.data.data_utils import build_train_valid_test_data_iterators @@ -765,6 +766,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): # config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) + mark_norms_for_sequence_parallel_grad_sync(model, neox_args) if neox_args.moe_num_experts > 1 and neox_args.moe_type == "megablocks": # We need to additionally set this flag to ensure DS parallelism properly handles this foreign MoE. model.has_moe_layers = True @@ -891,6 +893,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and neox_args.iteration <= neox_args.profile_step_stop ): torch.cuda.nvtx.range_push(f"Optimizer step") + timers("optimizer").start() if neox_args.deepspeed: model.step() From e7c0182f4b8f0be7c894e292baebabc6d16cecc2 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 23 Aug 2024 21:38:52 -0400 Subject: [PATCH 20/27] Add new cites (#1255) * Update README.md I added new models that have come out trained with the GPT-NeoX library. The library itself is sufficiently well-used that simply listing all citing papers is rapidly becoming non-viable. I'm currently leaning towards providing a curated list of "exciting" papers? I haven't looked at other libraries to see what they do yet. 
* Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e63a59f28..b5fc0d877 100644 --- a/README.md +++ b/README.md @@ -736,7 +736,7 @@ The following publications by other research groups use this library: The following models were trained using this library: ### English LLMs -- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) +- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia) - CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B) - StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM) - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) @@ -747,13 +747,15 @@ The following models were trained using this library: ### Non-English LLMs - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) - Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean) -- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) +- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese) - LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean) - Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese) - CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese) - The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese) - The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese) - nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi) +- Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese) +- The Basque Center for Language Technology's [Latixna (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque) ### Code Models - Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm) @@ -761,11 +763,13 @@ The following models were trained using this library: - CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) ### AI for Science +- EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) - Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge) -- Oak Ridge National Lab and EleutherAI's [Unnamed Material Science Domain Models (7B)](https://github.com/at-aaims/forge) +- Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691) - Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr) ### Other Modalities +- Rinna Co.'s [PSLM 
(7B)](https://arxiv.org/abs/2406.12428) (speech / text) - University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1) - Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table) From 591563d3f6a54af2279aad40444c3a04033cf22b Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:51:56 -0400 Subject: [PATCH 21/27] mamba fixes and cleaning (#1262) * mamba fixes and cleaning * space * revert assertion change for now --------- Co-authored-by: Jacob Hatef --- configs/mamba/mamba-1.4B.yml | 68 ++++++++++++++++++++++++++- configs/mamba/mamba-130M.yml | 69 ++++++++++++++++++++++++++- configs/mamba/mamba-2.8B.yml | 68 ++++++++++++++++++++++++++- configs/mamba/mamba-370M.yml | 69 ++++++++++++++++++++++++++- configs/mamba/mamba-790M.yml | 70 +++++++++++++++++++++++++++- megatron/model/mamba/mamba.py | 5 +- megatron/neox_arguments/arguments.py | 2 +- 7 files changed, 339 insertions(+), 12 deletions(-) diff --git a/configs/mamba/mamba-1.4B.yml b/configs/mamba/mamba-1.4B.yml index 2898a72fd..eae467d0e 100644 --- a/configs/mamba/mamba-1.4B.yml +++ b/configs/mamba/mamba-1.4B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00002, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 1, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index d9a6ab92e..7187048e6 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -19,5 +19,70 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", -} + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, diff --git a/configs/mamba/mamba-2.8B.yml b/configs/mamba/mamba-2.8B.yml index 1aacb264b..d5afef368 100644 --- a/configs/mamba/mamba-2.8B.yml +++ b/configs/mamba/mamba-2.8B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00016, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000016, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-370M.yml b/configs/mamba/mamba-370M.yml index 5e5a78cca..0058f1c0e 100644 --- a/configs/mamba/mamba-370M.yml +++ b/configs/mamba/mamba-370M.yml @@ -12,12 +12,77 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00003, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-790M.yml b/configs/mamba/mamba-790M.yml index fcd324d9d..4aef7e813 100644 --- a/configs/mamba/mamba-790M.yml +++ b/configs/mamba/mamba-790M.yml @@ -12,12 +12,78 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000025, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index d5d6b336f..3177267cb 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -13,9 +13,8 @@ from causal_conv1d import causal_conv1d_fn import einops except ModuleNotFoundError: - print( - "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba" - ) + print( "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba") pass from megatron.model.norms import get_norm diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 6a84df6c7..fb26fb4aa 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1189,7 +1189,7 @@ def validate_values(self): return False # Checks. 
- if self.hidden_size % self.num_attention_heads != 0: + if self.hidden_size % self.num_attention_heads != 0 and not ("mamba" in self.attention_config): error_message = ( self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" From c7863673e3c08b5886cae36cf096a0fb5789dd0e Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:58:21 -0500 Subject: [PATCH 22/27] SFT improvements (labeling fixes, different packing implementations) (#1240) * - add different packing impl (Unpacked, packing until overflow) - fix labels to also have valid/test implementations - fix label masking in _get_batch to also include anything from get_ltor_masks_and_position_ids * Update arguments.py to use train_label_data_paths instead of label_data_paths * - fix precommit --- megatron/data/data_utils.py | 37 +++++- megatron/data/gpt2_dataset.py | 188 ++++++++++++++++++++++----- megatron/neox_arguments/arguments.py | 6 +- megatron/neox_arguments/neox_args.py | 29 ++++- megatron/training.py | 21 +-- 5 files changed, 227 insertions(+), 54 deletions(-) diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index bc5754cdb..7e4dbdb37 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -55,6 +55,8 @@ def build_the_dataset( data_prefix, name, data_impl, + pack_impl, + allow_chopped, num_samples, seq_length, seed, @@ -83,6 +85,8 @@ def build_the_dataset( num_samples, seq_length, seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, build_index_mappings=build_index_mappings, label_dataset=label_dataset, ) @@ -93,6 +97,8 @@ def build_train_valid_test_datasets( data_prefix, use_shared_fs, data_impl, + pack_impl, + allow_chopped, splits_string, train_valid_test_num_samples, seq_length, @@ -138,6 +144,8 @@ def build_dataset(index, name): train_valid_test_num_samples[index], seq_length, seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, use_shared_fs=use_shared_fs, ) return dataset @@ -204,12 +212,25 @@ def build_weighted_datasets( ): # build individual datasets train_datasets, valid_datasets, test_datasets = [], [], [] - for i, (train_path, label_path, valid_path, test_path) in enumerate( + for i, ( + train_path, + train_label_path, + valid_path, + valid_label_path, + test_path, + test_label_path, + ) in enumerate( zip_longest( neox_args.train_data_paths, - neox_args.label_data_paths if neox_args.label_data_paths else [], + neox_args.train_label_data_paths + if neox_args.train_label_data_paths + else [], neox_args.valid_data_paths, + neox_args.valid_label_data_paths + if neox_args.valid_label_data_paths + else [], neox_args.test_data_paths, + neox_args.test_label_data_paths if neox_args.test_label_data_paths else [], ) ): if train_path: @@ -218,12 +239,14 @@ def build_weighted_datasets( data_prefix=train_path, name=f"train_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=train_num_samples[i], seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, - label_prefix=label_path, + label_prefix=train_label_path, ) ) @@ -233,11 +256,14 @@ def build_weighted_datasets( data_prefix=valid_path, name=f"valid_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=valid_num_samples[i], seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not 
neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=valid_label_path, ) ) @@ -247,11 +273,14 @@ def build_weighted_datasets( data_prefix=test_path, name=f"test_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=test_num_samples[i], seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=test_label_path, ) ) return train_datasets, valid_datasets, test_datasets @@ -414,6 +443,8 @@ def build_train_valid_test_data_iterators(neox_args): seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, ) # Build dataloders. diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index 75e601fda..edba57df2 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -36,14 +36,19 @@ def __init__( num_samples, seq_length, seed, + pack_impl="packed", + allow_chopped=True, build_index_mappings=True, use_shared_fs=True, label_dataset=None, ): self.name = name + self.pack_impl = pack_impl + self.allow_chopped = allow_chopped self.indexed_dataset = indexed_dataset self.label_dataset = label_dataset + self.seq_length = seq_length # Checks assert np.min(documents) >= 0 @@ -56,10 +61,13 @@ def __init__( data_prefix, documents, self.indexed_dataset.sizes, + self.label_dataset, num_samples, seq_length, seed, + self.pack_impl, use_shared_fs=use_shared_fs, + allow_chopped=self.allow_chopped, ) self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 self.sample_idx_len = self.sample_idx.shape[0] - 1 @@ -113,8 +121,38 @@ def __getitem__(self, idx): samples.append(np.concatenate(sample_list)) if len(datasets) == 1: + if len(samples[0]) < (self.seq_length + 1): + # Pad with -100s so the masking function can ignore these. + samples[0] = np.pad( + samples[0], + (0, (self.seq_length + 1) - len(samples[0])), + mode="constant", + constant_values=-100, + ) + elif len(samples[0]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[0] = samples[0][: (self.seq_length + 1)] return {"text": np.array(samples[0], dtype=np.int64)} else: + if len(samples[0]) < (self.seq_length + 1): + # Pad with 0s, can use any number since it's masked. + samples[0] = np.pad( + samples[0], + (0, (self.seq_length + 1) - len(samples[0])), + mode="constant", + constant_values=0, + ) + # pad with -100s so we can mask it out + samples[1] = np.pad( + samples[1], + (0, (self.seq_length + 1) - len(samples[1])), + mode="constant", + constant_values=-100, + ) + elif len(samples[0]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[0] = samples[0][: (self.seq_length + 1)] + samples[1] = samples[1][: (self.seq_length + 1)] return { "text": np.array(samples[0], dtype=np.int64), "label": np.array(samples[1], dtype=np.int64), @@ -132,10 +170,13 @@ def _build_index_mappings( data_prefix, documents, sizes, + label_dataset, num_samples, seq_length, seed, + packing_impl, use_shared_fs=True, + allow_chopped=True, ): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. 
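A rough sketch of what the three `pack_impl` modes introduced in this patch mean for a stream of tokenized documents. This is orientation only: the `pack_documents` helper below is not part of the patch, and the real `GPT2Dataset` implements the same ideas through doc/sample/shuffle index mappings (padding or truncating each sample to `seq_length + 1` in `__getitem__`) rather than by concatenating token lists.

```python
# Conceptual sketch of the pack_impl modes added in this patch (illustrative,
# not the repo's implementation, which builds index mappings instead).
def pack_documents(docs, seq_length, pack_impl="packed", allow_chopped=True):
    samples = []
    if pack_impl == "packed":
        # Concatenate everything and cut fixed-length windows, so a sample
        # may begin or end mid-document.
        stream = [tok for doc in docs for tok in doc]
        for i in range(0, max(len(stream) - seq_length, 0), seq_length):
            samples.append(stream[i : i + seq_length + 1])
    elif pack_impl == "pack_until_overflow":
        # Greedily append whole documents until the next one would overflow,
        # then start a new sample; short samples are padded downstream.
        current = []
        for doc in docs:
            if not allow_chopped and len(doc) > seq_length + 1:
                continue  # a too-long document could only be used by chopping it
            if current and len(current) + len(doc) > seq_length + 1:
                samples.append(current)
                current = []
            current = current + list(doc)
        if current:
            samples.append(current)
    else:  # "unpacked": one document per sample
        samples = [
            list(doc) for doc in docs if allow_chopped or len(doc) <= seq_length + 1
        ]
    return samples


docs = [[1] * 10, [2] * 300, [3] * 40]
print([len(s) for s in pack_documents(docs, 128, "pack_until_overflow")])  # [10, 300, 40]
```

In a config these surface as `"pack_impl": "pack_until_overflow"` (or `"unpacked"`) and `"allow_chopped": false`; as the `allow_chopped` docstring later in this patch notes, the flag is ignored by the default `"packed"` mode.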
@@ -155,6 +196,9 @@ def _build_index_mappings( _filename += "_{}ns".format(num_samples) _filename += "_{}sl".format(seq_length) _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + if allow_chopped: + _filename += "_ac" doc_idx_filename = _filename + "_doc_idx.npy" sample_idx_filename = _filename + "_sample_idx.npy" shuffle_idx_filename = _filename + "_shuffle_idx.npy" @@ -177,44 +221,116 @@ def _build_index_mappings( ) # doc-idx. start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save doc-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. - from megatron.data import helpers - - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - - num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length - if 2 * (num_samples + 1) < np.iinfo(np.int32).max: - sample_idx = helpers.build_sample_idx_int32( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + if packing_impl == "packed": + doc_idx = _build_doc_idx(documents, num_epochs, np_rng) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - else: - sample_idx = helpers.build_sample_idx_int64( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + from megatron.data import helpers + + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + + num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length + if 2 * (num_samples + 1) < np.iinfo(np.int32).max: + sample_idx = helpers.build_sample_idx_int32( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + else: + sample_idx = helpers.build_sample_idx_int64( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save sample-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retrieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save shuffle-idx mapping" - " (seconds): {:4f}".format(time.time() - start_time) - ) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retrieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) + elif packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. 
+ shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[temp_shuffle_idx[curr_shuffle_idx]] > seq_length + 1: + curr_shuffle_idx += 1 + continue + # First, check if we need to skip this item... + if label_dataset is not None: + if np.all( + label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = sizes[temp_shuffle_idx[curr_shuffle_idx]] + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # Just in case we have bad data in the loop... 
+ if np.all(label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index fb26fb4aa..327639454 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1121,10 +1121,8 @@ def calculate_derived(self): if self.test_data_paths and (self.test_data_weights is None): self.test_data_weights = [1.0] * len(self.test_data_paths) - if self.label_data_paths: - err_str = ( - "Must use `label_data_paths` with `train_data_paths`, not `data_path`" - ) + if self.train_label_data_paths: + err_str = "Must use `train_label_data_paths` with `train_data_paths`, not `data_path`" assert self.train_data_paths and not self.data_path, err_str # if a sample input file is provided, default text_gen_type type to input-file diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 7993f785f..dd51c7778 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -855,9 +855,9 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to train datasets. """ - label_data_paths: list = None + train_label_data_paths: list = None """ - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train label datasets (not shifted by 1 yet!). """ test_data_paths: list = None @@ -865,11 +865,21 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to test datasets. """ + test_label_data_paths: list = None + """ + List of paths to test label datasets (not shifted by 1 yet!). + """ + valid_data_paths: list = None """ List of paths to validation datasets. """ + valid_label_data_paths: list = None + """ + List of paths to validation label datasets (not shifted by 1 yet!). + """ + train_data_weights: list = None """ List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. @@ -919,6 +929,21 @@ class NeoXArgsTraining(NeoXArgsTemplate): Implementation of indexed datasets, can be one of "infer", "cached", or "mmap" """ + pack_impl: Literal["packed", "pack_until_overflow", "unpacked"] = "packed" + """ + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". + + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + """ + + allow_chopped: bool = True + """ + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. + (e.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + """ + mmap_warmup: bool = False """ Warm up mmap files. diff --git a/megatron/training.py b/megatron/training.py index ce59b242a..fc3d9e129 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -278,16 +278,19 @@ def pretrain(neox_args): def _get_batch(neox_args, tokenizer, keys, data, datatype): """Support function for get_batch / get_batch pipe (to avoid code repetition)""" data_b = mpu.broadcast_data(keys, data, datatype) - + token_key = keys[0] + label_key = keys[1] if len(keys) > 1 else None # Unpack. 
- tokens_ = data_b["text"].long() + tokens_ = data_b[token_key].long() if "label" in data_b: + label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous() labels = torch.where( - data_b["label"].long() >= 0, - data_b["label"].long(), + data_b[label_key].long() >= 0, + data_b[label_key].long(), torch.zeros_like(data_b["label"].long()), )[:, 1:].contiguous() else: + label_mask = (tokens_.long() >= 0)[:, 1:].contiguous() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() @@ -298,9 +301,9 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): eod_mask_loss=neox_args.eod_mask_loss, sliding_window_width=neox_args.sliding_window_width, ) - # If `label` is present, any token < 0 (e.g., -100, the default for torch) skips the loss computation - if "label" in data_b: - loss_mask = (data_b["label"][:, 1:] >= 0).to(loss_mask.dtype) + + # combine loss masks from get_ltor_masks_and_position_ids with loss masks from data + loss_mask = label_mask.to(loss_mask.dtype) * loss_mask return tokens, labels, loss_mask, attention_mask, position_ids @@ -308,7 +311,7 @@ def get_batch(neox_args, data_iterator): """Generate a batch""" # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] datatype = torch.int64 # Broadcast data. @@ -328,7 +331,7 @@ def get_batch(neox_args, data_iterator): def get_batch_pipe(data, neox_args, curr_scheduler=None): """A modification of get_batch() to work with the latest batch instead of an iterator.""" # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] datatype = torch.int64 tokens, labels, loss_mask, attention_mask, position_ids = _get_batch( From 7548a8b76426fbf64e52343ad1846022793de58d Mon Sep 17 00:00:00 2001 From: AI-WAIFU <67525070+AI-WAIFU@users.noreply.github.com> Date: Thu, 5 Sep 2024 21:16:02 +0100 Subject: [PATCH 23/27] add assert for missing tokenizer_type in config (#1267) --- megatron/tokenizer/tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 348c7cefe..e450504c8 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -31,6 +31,8 @@ def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) + + assert args.tokenizer_type is not None, "tokenizer_type must be specified in the .yml config" # Select and instantiate the tokenizer. 
if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): From 0d4bdb965e3bd9eed6df8f5a4f024f9ff310601c Mon Sep 17 00:00:00 2001 From: dtamayo <119006120+dtamayo-nlp@users.noreply.github.com> Date: Sat, 7 Sep 2024 06:17:14 +0200 Subject: [PATCH 24/27] Add `intermediate_size` to GPT-NeoX models (#1212) * Update transformer.py -> Add `intermediate_size` * add support for rwkv and mamba and add todos about swiglu * refactor activations and mlps * change llama config to swiglu * fixes gelu fusion * pre-commit run * add assert message to mamba linear * Update 1-3B.yml revert accidental change * Update 1-3B.yml * fixes various issues * add back swiglu check --------- Co-authored-by: jahatef Co-authored-by: Quentin Anthony Co-authored-by: Jacob Hatef <74274091+jahatef@users.noreply.github.com> --- configs/llama/13B.yml | 2 +- configs/llama/30B.yml | 2 +- configs/llama/65B.yml | 2 +- configs/llama/7B.yml | 2 +- megatron/data/helpers.cpp | 12 +- megatron/model/activations.py | 38 +++--- megatron/model/gmlp.py | 2 +- megatron/model/mamba/mamba.py | 9 +- megatron/model/rwkv/v6/rwkv.py | 17 ++- megatron/model/transformer.py | 167 +++++++++------------------ megatron/neox_arguments/neox_args.py | 23 +++- 11 files changed, 117 insertions(+), 159 deletions(-) diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml index 305567be1..7a823a43c 100644 --- a/configs/llama/13B.yml +++ b/configs/llama/13B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml index 450f8da38..2c356cea2 100644 --- a/configs/llama/30B.yml +++ b/configs/llama/30B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml index 85f199ce2..cc22d3734 100644 --- a/configs/llama/65B.yml +++ b/configs/llama/65B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml index ecbf187a8..0b134ae27 100644 --- a/configs/llama/7B.yml +++ b/configs/llama/7B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index aca290854..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... 
- } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 7a29b0716..c0b825261 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -25,9 +25,23 @@ def get_activation(neox_args): - """retrieves the activation function specified in neox_args""" + """retrieves the activation function specified in neox_args and whether or not the activation is gated""" + is_gated = False if neox_args.activation == "geglu": - activation_func = GEGLU(neox_args=neox_args) + is_gated = True + activation_func = F.gelu + elif neox_args.activation == "reglu": + is_gated = True + activation_func = F.relu + elif neox_args.activation == "bilinear": + is_gated = True + activation_func = lambda x: x + elif neox_args.activation == "swiglu": + is_gated = True + activation_func = swish + elif neox_args.activation == "glu": + is_gated = True + activation_func = F.sigmoid elif neox_args.activation == "gelu": if neox_args.onnx_safe and neox_args.bias_gelu_fusion: raise ValueError("onnx_safe + bias_gelu_fusion not compatible") @@ -49,7 +63,7 @@ def get_activation(neox_args): activation_func = F.silu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") - return activation_func + return activation_func, is_gated ###### BIAS GELU FUSION/ NO AUTOGRAD ################ @@ -119,21 +133,3 @@ def swish(x, beta: float = 1.0): @torch.jit.script def mish(x): return x * torch.tanh(F.softplus(x)) - - -class GEGLU(torch.nn.Module): - def __init__(self, neox_args): - super(GEGLU, self).__init__() - if neox_args.onnx_safe: - self.activation_func = erf_gelu - else: - self.activation_func = F.gelu - - def forward(self, x, bias=None): - x, gate = x.chunk(2, dim=-1) - if bias is not None: - bias_1, bias_2 = bias.chunk(2, dim=-1) - x = x + bias_1 - gate = gate + bias_2 - intermediate_parallel = self.activation_func(gate) - return intermediate_parallel * x diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py index c3462c651..6400640bd 100644 --- a/megatron/model/gmlp.py +++ b/megatron/model/gmlp.py @@ -112,7 +112,7 @@ def __init__( init_method=init_method, skip_bias_add=True, ) - self.activation_func = get_activation(neox_args) + self.activation_func, _ = get_activation(neox_args) ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) if neox_args.attention_config[layer_number] == "amlp": d_attn = neox_args.gmlp_attn_dim diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index 3177267cb..b3d9e1549 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -44,12 +44,17 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" 
+ assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size self.d_state = 16 # state dimensions per channel self.d_conv = 4 # convolution width - self.expand = 2 # linear projection expansion factors - self.d_inner = int(self.expand * self.d_model) + if neox_args.intermediate_size: + self.d_inner = neox_args.intermediate_size + else: + self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2 + self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index 5d4e0d144..ec8cc1aa6 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -247,11 +247,11 @@ def __init__(self, neox_args, layer_number): self.time_maa_k = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) self.time_maa_r = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) - self.key = nn.Linear(neox_args.hidden_size, neox_args.dim_ffn, bias=False) + self.key = nn.Linear(neox_args.hidden_size, neox_args.ffn_dim, bias=False) self.receptance = nn.Linear( neox_args.hidden_size, neox_args.hidden_size, bias=False ) - self.value = nn.Linear(neox_args.dim_ffn, neox_args.hidden_size, bias=False) + self.value = nn.Linear(neox_args.ffn_dim, neox_args.hidden_size, bias=False) def forward(self, x): xx = self.time_shift(x) - x @@ -275,14 +275,19 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" + assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size - if not hasattr(neox_args, "dim_ffn"): - # Make hidden size 3.5x. Round to nearest multiple of 32 until we add hdim rounding logic - neox_args.dim_ffn = int((neox_args.hidden_size * 3.5) // 32 * 32) + if neox_args.intermediate_size: + neox_args.ffn_dim = neox_args.intermediate_size + else: + self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) + # Make hidden size 3.5x by default. 
Round to nearest multiple of 32 until we add hdim rounding logic + neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) assert neox_args.hidden_size % 32 == 0 assert neox_args.dim_att % 32 == 0 - assert neox_args.dim_ffn % 32 == 0 + assert neox_args.ffn_dim % 32 == 0 self.neox_args.head_size = neox_args.dim_att // neox_args.num_attention_heads self.head_size = self.neox_args.head_size self.num_attention_heads = neox_args.num_attention_heads diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 62e7d3a9c..119676c54 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -93,37 +93,55 @@ def __init__( init_method, output_layer_init_method, parallel_output=False, + multiple_of=256, MOE=False, MoE_mp_size=1, ): super().__init__() + assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" - self.activation_func = get_activation(neox_args) + self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation self.bias_gelu_fusion = neox_args.bias_gelu_fusion + self.multiple_of = multiple_of - # auto scale so geglu has equal parameters - ff_mult = int(4 * 2 / 3) if self.activation_type == "geglu" else 4 - ff_dim = ( - int(ff_mult * neox_args.hidden_size) * 2 - if self.activation_type == "geglu" - else ff_mult * neox_args.hidden_size + if neox_args.intermediate_size: + ffn_dim = neox_args.intermediate_size + elif neox_args.expansion_factor: + ffn_dim = int(neox_args.expansion_factor * neox_args.hidden_size) + else: + # 4h is default for ffn_dim + ffn_dim = 4 * neox_args.hidden_size + ffn_dim_in = ffn_dim + if self.is_gated: + # set activation function to be gated implementation + self.activation_func = Gated_Activation(self.activation_func) + # auto scale so gated activations has equal parameters + ffn_dim = int(ffn_dim * 2 / 3) + ffn_dim_in = ffn_dim // 2 + # set multiple + ffn_dim = int( + (2 * self.multiple_of) + * ((ffn_dim + (2 * multiple_of) - 1) // (2 * multiple_of)) + ) + ffn_dim_in = int( + self.multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of) ) - self.dense_h_to_4h = mpu.ColumnParallelLinear( + + self.linear1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, - output_size=ff_dim, + output_size=ffn_dim, gather_output=False, init_method=init_method, skip_bias_add=True, MOE=MOE, MoE_mp_size=MoE_mp_size, ) - ff_dim_in = ff_dim // 2 if self.activation_type == "geglu" else ff_dim # Project back to h. 
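A worked example of the gated-MLP sizing arithmetic above. The hidden size is an assumed value (not taken from any shipped config), and `multiple_of` keeps the default of 256 from the constructor signature:

```python
# Worked example of the ffn_dim computation above for a gated activation
# such as swiglu; hidden_size is an assumed example value.
hidden_size, multiple_of = 4096, 256

ffn_dim = 4 * hidden_size       # default 4h                        -> 16384
ffn_dim = int(ffn_dim * 2 / 3)  # gated: keep parameter count close -> 10922
ffn_dim_in = ffn_dim // 2       # width after the gate is applied   -> 5461

# Round both widths up to friendly multiples (clean model-parallel splits
# and efficient GEMM shapes).
ffn_dim = (2 * multiple_of) * ((ffn_dim + 2 * multiple_of - 1) // (2 * multiple_of))
ffn_dim_in = multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of)

print(ffn_dim, ffn_dim_in)  # 11264 5632
```

So `linear1` emits 11264 features, `Gated_Activation` chunks them into a 5632-wide value and a 5632-wide gate, and `linear2` (the down projection below) maps the 5632-wide product back to `hidden_size`. If `intermediate_size` or `expansion_factor` is set, the starting point is that value rather than 4h.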
- self.dense_4h_to_h = mpu.RowParallelLinear( + self.linear2 = mpu.RowParallelLinear( neox_args=neox_args, - input_size=ff_dim_in, + input_size=ffn_dim_in, output_size=neox_args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, @@ -134,13 +152,10 @@ def __init__( ) def forward(self, hidden_states): + # [s, b, intermediate_size] + intermediate_parallel, bias_parallel = self.linear1(hidden_states) - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - - if ( - self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + if self.is_gated or (self.activation_type == "gelu" and self.bias_gelu_fusion): intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -150,84 +165,23 @@ def forward(self, hidden_states): ) # [s, b, h] - output, output_bias = self.dense_4h_to_h(intermediate_parallel) + output, output_bias = self.linear2(intermediate_parallel) return output, output_bias -class LLaMAParallelMLP(nn.Module): - """LLaMA's MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. At the end, dropout is also - applied. - - Note: multiple_of is used to compute the hidden dimension of the MLP - """ - - def __init__( - self, - neox_args, - init_method, - output_layer_init_method, - parallel_output=False, - multiple_of=256, - MOE=False, - MoE_mp_size=1, - ): +class Gated_Activation(torch.nn.Module): + def __init__(self, activation_func): super().__init__() + self.activation_func = activation_func - self.activation_func = get_activation(neox_args) - self.activation_type = neox_args.activation - - self.multiple_of = multiple_of - - # Allow custom intermediate size, e.g. 
for Mistral - if neox_args.intermediate_size is not None: - ff_dim = neox_args.intermediate_size - else: - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.w1 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - self.w3 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - self.w2 = mpu.RowParallelLinear( - neox_args=neox_args, - input_size=ff_dim, - output_size=neox_args.hidden_size, - input_is_parallel=True, - init_method=output_layer_init_method, - skip_bias_add=True, - parallel_output=parallel_output, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - - def forward(self, hidden_states): - w1_out, _ = self.w1(hidden_states) - w3_out, _ = self.w3(hidden_states) - return self.w2(self.activation_func(w1_out) * w3_out) + def forward(self, x, bias=None): + x, gate = x.chunk(2, dim=-1) + if bias is not None: + bias_1, bias_2 = bias.chunk(2, dim=-1) + x = x + bias_1 + gate = gate + bias_2 + intermediate_parallel = self.activation_func(gate) + return intermediate_parallel * x class ParallelLinear(nn.Module): @@ -1054,24 +1008,13 @@ def __init__( # MLP def get_mlp(mlp_type, **kw): - if mlp_type == "regular": - return ParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - elif mlp_type == "llama": - return LLaMAParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - else: - raise KeyError(mlp_type) + return ParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + **kw, + ) self.num_experts = ( neox_args.moe_num_experts @@ -1287,11 +1230,7 @@ def forward(self, x, attention_mask, layer_past=None): raise KeyError(self.moe_type) with torch.enable_grad(): - if ( - self.mlp_type == "llama" - or self.num_experts > 1 - and self.moe_type == "deepspeed" - ): + if self.activation == "swiglu" or self.num_experts > 1 and self.moe_type == "deepspeed": # No dropout either assert mlp_bias is None output = mlp_output + attention_output diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index dd51c7778..818c86d31 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -121,9 +121,12 @@ class NeoXArgsModel(NeoXArgsTemplate): intermediate_size: int = None """ - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h + """ - If not passed, will be set to a reasonable default. + expansion_factor: float = None + """ + Transformer intermediate size. 
Default = 4 """ num_attention_heads: int = None @@ -278,10 +281,20 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", + "geglu", + "relu", + "softsign", + "swish", + "mish", + "silu", + "reglu", + "swiglu", + "bilinear", + "glu", ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] """ scaled_upper_triang_masked_softmax_fusion: bool = False @@ -421,9 +434,9 @@ class NeoXArgsModel(NeoXArgsTemplate): mlp_type: str = "regular" """ + Currently, the only mlp_type is "regular." This behavior is currently deprecated. Types: regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) """ soft_prompt_tuning: dict = None From ec82c05780d40404c618d4905ad14b670a91bd3c Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Sun, 8 Sep 2024 14:14:33 -0700 Subject: [PATCH 25/27] apply pre-commit and add missing close-paren to mamba config (#1270) --- configs/mamba/mamba-130M.yml | 1 + megatron/data/helpers.cpp | 12 ++++++------ megatron/model/mamba/mamba.py | 14 ++++++++++---- megatron/model/rwkv/v6/rwkv.py | 8 ++++++-- megatron/model/transformer.py | 10 ++++++++-- megatron/neox_arguments/arguments.py | 4 +++- megatron/tokenizer/tokenizer.py | 6 ++++-- 7 files changed, 38 insertions(+), 17 deletions(-) diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index 7187048e6..bd05723b2 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -86,3 +86,4 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, +} diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index b3d9e1549..950e36fed 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -13,8 +13,10 @@ from causal_conv1d import causal_conv1d_fn import einops except ModuleNotFoundError: - print( "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ - or directly from https://github.com/state-spaces/mamba") + print( + "Unable to import Mamba kernels. 
Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba" + ) pass from megatron.model.norms import get_norm @@ -44,7 +46,9 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size @@ -53,7 +57,9 @@ def __init__( if neox_args.intermediate_size: self.d_inner = neox_args.intermediate_size else: - self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2 + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 2 + ) self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index ec8cc1aa6..b3741a3fc 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -275,13 +275,17 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size if neox_args.intermediate_size: neox_args.ffn_dim = neox_args.intermediate_size else: - self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + ) neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) # Make hidden size 3.5x by default. 
Round to nearest multiple of 32 until we add hdim rounding logic neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 119676c54..d2b93eb06 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -98,7 +98,9 @@ def __init__( MoE_mp_size=1, ): super().__init__() - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation @@ -1230,7 +1232,11 @@ def forward(self, x, attention_mask, layer_past=None): raise KeyError(self.moe_type) with torch.enable_grad(): - if self.activation == "swiglu" or self.num_experts > 1 and self.moe_type == "deepspeed": + if ( + self.activation == "swiglu" + or self.num_experts > 1 + and self.moe_type == "deepspeed" + ): # No dropout either assert mlp_bias is None output = mlp_output + attention_output diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 327639454..8fbe045bb 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1187,7 +1187,9 @@ def validate_values(self): return False # Checks. - if self.hidden_size % self.num_attention_heads != 0 and not ("mamba" in self.attention_config): + if self.hidden_size % self.num_attention_heads != 0 and not ( + "mamba" in self.attention_config + ): error_message = ( self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index e450504c8..d39e18243 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -31,8 +31,10 @@ def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) - - assert args.tokenizer_type is not None, "tokenizer_type must be specified in the .yml config" + + assert ( + args.tokenizer_type is not None + ), "tokenizer_type must be specified in the .yml config" # Select and instantiate the tokenizer. if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): From 77e8158ac845c44ba3d3d63e15083b766a41afe0 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Sun, 8 Sep 2024 18:58:11 -0500 Subject: [PATCH 26/27] Add DPO training (#1242) * Add a chat data preprocessing script * add EOT at end of a chat * - add different packing impl (Unpacked, packing until overflow) - fix labels to also have valid/test implementations - fix label masking in _get_batch to also include anything from get_ltor_masks_and_position_ids * update README.md * - Add metrics to forward step to add DPO specific metrics that are useful (accuracy, etc) - Add reference model setup for DPO - Add pairwise dataset for positive/negative pairs - Add DPO loss * Update arguments.py to use train_label_data_paths instead of label_data_paths * - Bugfixes from upstreaming.... * - add precompute logprobs... * - Finishing up precompute logprobs... * - update readme for DPO... 
* fix varname * Fix pipeline parallelism and incorrect neox_args name * apply precommit --------- Co-authored-by: Quentin Anthony --- configs/README.md | 27 ++ generate.py | 3 + megatron/data/data_utils.py | 178 +++++-- megatron/data/pairwise_dataset.py | 457 ++++++++++++++++++ megatron/neox_arguments/arguments.py | 6 + megatron/neox_arguments/neox_args.py | 63 ++- megatron/text_generation_utils.py | 192 +++++++- megatron/training.py | 313 +++++++++--- megatron/utils.py | 2 +- .../preprocess_data_with_chat_template.py | 1 + 10 files changed, 1145 insertions(+), 97 deletions(-) create mode 100644 megatron/data/pairwise_dataset.py diff --git a/configs/README.md b/configs/README.md index e14274b56..3102a34d1 100644 --- a/configs/README.md +++ b/configs/README.md @@ -235,6 +235,33 @@ Additional DeepSpeed settings besides those mentioned above should be wrapped in "eval_iters": 10, ``` +However, if you want to use DPO style training you'll need to set pos/neg data paths instead of a single one, e.g. + +```yaml + "dataset_impl": "pairwise", + "train_impl": "dpo", + "pack_impl": "unpacked", + "dpo_beta": 0.1, + "dpo_fp32": true, + "pos_train_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_valid_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_test_data_path": "data/enwik8/enwik8_text_pos_document", + "neg_train_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_valid_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_test_data_path": "data/enwik8/enwik8_text_neg_document", + ## If you have labels... (likely to mask out user turns) + "pos_train_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_valid_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_test_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "neg_train_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_valid_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_test_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + ## If you want to precompute the logits over your dataset... 
+ "precompute_model_name": "gpt2", + ## Needed for the generation.py step, if precomputing + "text_gen_type": "precompute" +``` + ### LR Scheduler settings ```yaml diff --git a/generate.py b/generate.py index 743e350d0..e19ef2e0e 100755 --- a/generate.py +++ b/generate.py @@ -23,6 +23,7 @@ generate_samples_from_prompt, generate_samples_unconditional, generate_samples_interactive, + precompute_logits, ) @@ -83,6 +84,8 @@ def main(input_args=None, overwrite_values=None): top_p=neox_args.top_p, ) + elif neox_args.text_gen_type == "precompute": + precompute_logits(neox_args=neox_args, model=model) else: raise ValueError( f"`text_gen_type` either not specified or not recognised: {neox_args.text_gen_type}" diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index 7e4dbdb37..7c13131ad 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -23,6 +23,7 @@ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset from megatron.data.blendable_dataset import BlendableDataset from megatron.data.gpt2_dataset import GPT2Dataset +from megatron.data.pairwise_dataset import PairwiseDataset from megatron.data.samplers import DistributedBatchSampler @@ -53,9 +54,12 @@ def make_data_loader(dataset, neox_args): def build_the_dataset( data_prefix, + pos_data_prefix, + neg_data_prefix, name, data_impl, pack_impl, + dataset_impl, allow_chopped, num_samples, seq_length, @@ -63,33 +67,100 @@ def build_the_dataset( skip_warmup, build_index_mappings=True, label_prefix=None, + pos_label_prefix=None, + neg_label_prefix=None, + precompute_model_name=None, ): """Build train/valid/test datasets.""" - - indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) - if label_prefix is None: - label_dataset = None + if dataset_impl == "gpt2": + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + if label_prefix is None: + label_dataset = None + else: + label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + if precompute_model_name is not None: + # If we have the name, assume it exists. If it doesn't, it will just be None which is fine. 
+ precompute_indexed_dataset = make_indexed_dataset( + data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + precompute_indexed_dataset = precompute_indexed_dataset + elif dataset_impl == "pairwise": + pos_indexed_dataset = make_indexed_dataset( + pos_data_prefix, data_impl, skip_warmup + ) + neg_indexed_dataset = make_indexed_dataset( + neg_data_prefix, data_impl, skip_warmup + ) + if pos_label_prefix is None: + pos_label_dataset = None + # Also do neg here since they both must be the same + assert neg_label_prefix is None + neg_label_dataset = None + else: + pos_label_dataset = make_indexed_dataset( + pos_label_prefix, data_impl, skip_warmup + ) + # Also do neg here since they both must be the same + assert neg_label_prefix is not None + neg_label_dataset = make_indexed_dataset( + neg_label_prefix, data_impl, skip_warmup + ) + if precompute_model_name is None: + pos_ref_dataset = None + neg_ref_dataset = None + else: + pos_ref_dataset = make_indexed_dataset( + pos_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + neg_ref_dataset = make_indexed_dataset( + neg_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) else: - label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + raise NotImplementedError(f"dataset_impl={dataset_impl} not implemented") - total_num_of_documents = indexed_dataset.sizes.shape[0] + total_num_of_documents = ( + indexed_dataset.sizes.shape[0] + if dataset_impl == "gpt2" + else pos_indexed_dataset.sizes.shape[0] + ) print_rank_0(" {}:".format(name)) print_rank_0(" no. of documents:{}".format(total_num_of_documents)) dataset = None documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) - dataset = GPT2Dataset( - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - pack_impl=pack_impl, - allow_chopped=allow_chopped, - build_index_mappings=build_index_mappings, - label_dataset=label_dataset, - ) + + if dataset_impl == "gpt2": + dataset = GPT2Dataset( + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + label_dataset=label_dataset, + ) + elif dataset_impl == "pairwise": + dataset = PairwiseDataset( + name, + pos_data_prefix, + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + pos_label_dataset=pos_label_dataset, + neg_label_dataset=neg_label_dataset, + pos_ref_dataset=pos_ref_dataset, + neg_ref_dataset=neg_ref_dataset, + ) + return dataset @@ -135,7 +206,6 @@ def build_dataset(index, name): documents = np.arange( start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 ) - dataset = GPT2Dataset( name, data_prefix, @@ -219,21 +289,57 @@ def build_weighted_datasets( valid_label_path, test_path, test_label_path, + pos_train_path, + neg_train_path, + pos_train_label_path, + neg_train_label_path, + pos_valid_path, + neg_valid_path, + pos_valid_label_path, + neg_valid_label_path, + pos_test_path, + neg_test_path, + pos_test_label_path, + neg_test_label_path, ) in enumerate( zip_longest( - neox_args.train_data_paths, + neox_args.train_data_paths if neox_args.train_data_paths else [], neox_args.train_label_data_paths if neox_args.train_label_data_paths else [], - neox_args.valid_data_paths, + neox_args.valid_data_paths if 
neox_args.valid_data_paths else [], neox_args.valid_label_data_paths if neox_args.valid_label_data_paths else [], - neox_args.test_data_paths, + neox_args.test_data_paths if neox_args.test_data_paths else [], neox_args.test_label_data_paths if neox_args.test_label_data_paths else [], + neox_args.pos_train_data_paths if neox_args.pos_train_data_paths else [], + neox_args.neg_train_data_paths if neox_args.neg_train_data_paths else [], + neox_args.pos_train_label_data_paths + if neox_args.pos_train_label_data_paths + else [], + neox_args.neg_train_label_data_paths + if neox_args.neg_train_label_data_paths + else [], + neox_args.pos_valid_data_paths if neox_args.pos_valid_data_paths else [], + neox_args.neg_valid_data_paths if neox_args.neg_valid_data_paths else [], + neox_args.pos_valid_label_data_paths + if neox_args.pos_valid_label_data_paths + else [], + neox_args.neg_valid_label_data_paths + if neox_args.neg_valid_label_data_paths + else [], + neox_args.pos_test_data_paths if neox_args.pos_test_data_paths else [], + neox_args.neg_test_data_paths if neox_args.neg_test_data_paths else [], + neox_args.pos_test_label_data_paths + if neox_args.pos_test_label_data_paths + else [], + neox_args.neg_test_label_data_paths + if neox_args.neg_test_label_data_paths + else [], ) ): - if train_path: + if train_path or pos_train_path: train_datasets.append( build_the_dataset( data_prefix=train_path, @@ -247,10 +353,16 @@ def build_weighted_datasets( skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, label_prefix=train_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_train_path, + neg_data_prefix=neg_train_path, + pos_label_prefix=pos_train_label_path, + neg_label_prefix=neg_train_label_path, + precompute_model_name=neox_args.precompute_model_name, ) ) - if valid_path: + if valid_path or pos_valid_path: valid_datasets.append( build_the_dataset( data_prefix=valid_path, @@ -264,10 +376,16 @@ def build_weighted_datasets( skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, label_prefix=valid_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_valid_path, + neg_data_prefix=neg_valid_path, + pos_label_prefix=pos_valid_label_path, + neg_label_prefix=neg_valid_label_path, + precompute_model_name=neox_args.precompute_model_name, ) ) - if test_path: + if test_path or pos_test_path: test_datasets.append( build_the_dataset( data_prefix=test_path, @@ -281,6 +399,12 @@ def build_weighted_datasets( skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, label_prefix=test_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_test_path, + neg_data_prefix=neg_test_path, + pos_label_prefix=pos_test_label_path, + neg_label_prefix=neg_test_label_path, + precompute_model_name=neox_args.precompute_model_name, ) ) return train_datasets, valid_datasets, test_datasets @@ -352,7 +476,7 @@ def build_train_valid_test_data_iterators(neox_args): test_iters * neox_args.train_batch_size, ] - if neox_args.train_data_paths: + if (neox_args.train_data_paths) or (neox_args.pos_train_data_paths): # when individual train / valid / test data paths are provided # normalize weight values and get num samples for each dataset train_weights, train_num_samples = get_normalized_weights_and_num_samples( diff --git a/megatron/data/pairwise_dataset.py b/megatron/data/pairwise_dataset.py new file mode 100644 index 000000000..e39b4d626 --- /dev/null +++ b/megatron/data/pairwise_dataset.py @@ -0,0 
+1,457 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pairwise style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 + + +class PairwiseDataset(torch.utils.data.Dataset): + def __init__( + self, + name, + pos_data_prefix, # Don't need neg since it's assumed you have paired the data already. + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl="unpacked", + build_index_mappings=True, + use_shared_fs=True, + pos_label_dataset=None, + pos_ref_dataset=None, + neg_label_dataset=None, + neg_ref_dataset=None, + allow_chopped=True, + ): + + self.name = name + self.pos_indexed_dataset = pos_indexed_dataset + self.pos_label_dataset = pos_label_dataset + self.pos_ref_dataset = pos_ref_dataset + self.neg_indexed_dataset = neg_indexed_dataset + self.neg_label_dataset = neg_label_dataset + self.neg_ref_dataset = neg_ref_dataset + self.pack_impl = pack_impl + self.seq_length = seq_length + # Checks + assert np.min(documents) >= 0 + assert (neg_label_dataset is not None and pos_label_dataset is not None) or ( + neg_label_dataset is None and pos_label_dataset is None + ), "Label datasets must be both None or both not None" + assert np.max(documents) < pos_indexed_dataset.sizes.shape[0] + assert pos_indexed_dataset.sizes.shape[0] == neg_indexed_dataset.sizes.shape[0] + assert ( + pack_impl != "packed" + ), "Packed implementation not supported for pairwise dataset" + + if build_index_mappings: + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + pos_data_prefix, + documents, + self.pos_indexed_dataset.sizes, + self.neg_indexed_dataset.sizes, + self.pos_label_dataset, + self.neg_label_dataset, + num_samples, + seq_length, + seed, + pack_impl, + use_shared_fs=use_shared_fs, + allow_chopped=allow_chopped, + ) + self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 + self.sample_idx_len = self.sample_idx.shape[0] - 1 + + if self.shuffle_idx_len != self.sample_idx_len - 1: + print( + f"WARNING: shuffle index length ({self.shuffle_idx_len}) is not equal to sample index length ({self.sample_idx_len})" + ) + + def __len__(self): + return min(self.shuffle_idx_len, self.sample_idx_len) + + def __getitem__(self, idx): + try: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # Labels and texts are supposed to be fully in sync. 
+ datasets = [self.pos_indexed_dataset, self.neg_indexed_dataset] + + if self.pos_label_dataset is not None: + datasets += [ + self.pos_label_dataset, + self.neg_label_dataset, + ] + if self.pos_ref_dataset is not None: + datasets += [ + self.pos_ref_dataset, + self.neg_ref_dataset, + ] + samples = [] + pos_ref_samples = [] + neg_ref_samples = [] + # If we are within the same document, just extract the chunk. + for n, dataset in enumerate(datasets): + if doc_index_f == doc_index_l: + samples.append( + dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) + ) + else: + # Otherwise, get the rest of the initial document. + sample_list = [ + dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append( + dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) + samples.append(np.concatenate(sample_list)) + for i in range(len(samples)): + if len(samples[i]) < (self.seq_length + 1): + if ((i == 2) or (i == 3)) and self.pos_label_dataset is not None: + # Labels... So pad with -100 + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=-100, + ) + else: + # Pad with 0s, can use any number since it's masked. + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=0, + ) + elif len(samples[i]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[i] = samples[i][: (self.seq_length + 1)] + ret = {} + ret["pos"] = np.array(samples[0], dtype=np.int64) + ret["neg"] = np.array(samples[1], dtype=np.int64) + if self.pos_label_dataset is not None: + ret["pos_label"] = np.array(samples[2], dtype=np.int64) + ret["neg_label"] = np.array(samples[3], dtype=np.int64) + if self.pos_ref_dataset is not None: + ret["pos_ref"] = np.array(samples[4], dtype=np.float32) + ret["neg_ref"] = np.array(samples[5], dtype=np.float32) + elif self.pos_ref_dataset is not None: + # Don't have labels... + ret["pos_ref"] = np.array(samples[2], dtype=np.float32) + ret["neg_ref"] = np.array(samples[3], dtype=np.float32) + return ret + except IndexError: + new_idx = idx % len(self) + print( + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + ) + return self[new_idx] + + +def _build_index_mappings( + name, + pos_data_prefix, + documents, + pos_sizes, + neg_sizes, + pos_label_dataset, + neg_label_dataset, + num_samples, + seq_length, + seed, + packing_impl, + use_shared_fs=True, + allow_chopped=True, +): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, pos_sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. 
+ _filename = pos_data_prefix + _filename += "_{}_indexmap".format(name) + _filename += "_{}ns".format(num_samples) + _filename += "_{}sl".format(seq_length) + _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + doc_idx_filename = _filename + "_doc_idx.npy" + sample_idx_filename = _filename + "_sample_idx.npy" + shuffle_idx_filename = _filename + "_shuffle_idx.npy" + + if not use_shared_fs: + should_process_dataset = int(os.environ["LOCAL_RANK"]) == 0 + else: + should_process_dataset = torch.distributed.get_rank() == 0 + + # Build the indexed mapping if not exist. + if should_process_dataset: + if ( + (not os.path.isfile(doc_idx_filename)) + or (not os.path.isfile(sample_idx_filename)) + or (not os.path.isfile(shuffle_idx_filename)) + ): + print_rank_0( + " > WARNING: could not find index map files, building " + "the indices on rank 0 ..." + ) + # doc-idx. + start_time = time.time() + if packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + # If not allow_chopped, skip this item if it's chopped. + if not allow_chopped: + if ( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + if ( + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + # Then, check if we need to skip this item... + if pos_label_dataset is not None: + if np.all( + pos_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + if np.all( + neg_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = max( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]], + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]], + ) + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.array([i % len(documents) for i in range(num_samples)]) + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + # Check if we need to skip this item... 
+ if not allow_chopped: + # +1 since we shift left/right by 1 + if pos_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + if neg_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # In theory if we don't allow chopped we should be able to skip it, but the warm fuzzies I get + # from this are worth the extra bool check + if np.all(pos_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + if np.all(neg_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_io_parallel_group() + ) + + # Load mappings. + start_time = time.time() + print_rank_0(" > loading doc-idx mapping from {}".format(doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading sample-idx mapping from {}".format(sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading shuffle-idx mapping from {}".format(shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + print_rank_0(" total number of samples: {}".format(sample_idx.shape[0])) + print_rank_0(" total number of epochs: {}".format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence length, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng): + """Build an array with length = number-of-epochs * number-of-documents. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int64) + + # Index into sample_idx. 
+ sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Beginning offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise, start from the beginning of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(size, np_rng): + """Build the range [0, size) and shuffle.""" + dtype_ = np.uint32 + if size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx) + return shuffle_idx diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 8fbe045bb..1677bf072 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1116,10 +1116,16 @@ def calculate_derived(self): # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) + elif self.pos_train_data_paths and (self.train_data_weights is None): + self.train_data_weights = [1.0] * len(self.pos_train_data_paths) if self.valid_data_paths and (self.valid_data_weights is None): self.valid_data_weights = [1.0] * len(self.valid_data_paths) + elif self.pos_valid_data_paths and (self.valid_data_weights is None): + self.valid_data_weights = [1.0] * len(self.pos_valid_data_paths) if self.test_data_paths and (self.test_data_weights is None): self.test_data_weights = [1.0] * len(self.test_data_paths) + elif self.pos_test_data_paths and (self.test_data_weights is None): + self.test_data_weights = [1.0] * len(self.pos_test_data_paths) if self.train_label_data_paths: err_str = "Must use `train_label_data_paths` with `train_data_paths`, not `data_path`" diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 818c86d31..814622a5b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -893,6 +893,42 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to validation label datasets (not shifted by 1 yet!). """ + pos_train_data_paths: list = None + neg_train_data_paths: list = None + """ + List of paths to positive and negative training datasets. + """ + + pos_train_label_data_paths: list = None + neg_train_label_data_paths: list = None + """ + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + """ + + pos_valid_data_paths: list = None + neg_valid_data_paths: list = None + """ + List of paths to positive and negative validation datasets. 
+ """ + + pos_valid_label_data_paths: list = None + neg_valid_label_data_paths: list = None + """ + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). + """ + + pos_test_data_paths: list = None + neg_test_data_paths: list = None + """ + List of paths to positive and negative test datasets. + """ + + pos_test_label_data_paths: list = None + neg_test_label_data_paths: list = None + """ + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + """ + train_data_weights: list = None """ List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. @@ -949,6 +985,26 @@ class NeoXArgsTraining(NeoXArgsTemplate): warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets """ + dataset_impl: Literal["gpt2", "pairwise"] = "gpt2" + """ + Dataset implementation, can be one of "gpt2" or "pairwise" + """ + + train_impl: Literal["normal", "dpo"] = "normal" + """ + Training implementation, can be one of "normal" or "dpo" + """ + + dpo_fp32: bool = True + """ + Whether to cast logits to fp32 for DPO loss calculation. + """ + + dpo_beta: float = 0.1 + """ + Beta value for DPO + """ + allow_chopped: bool = True """ WARNING: if your packing impl is packed, this is ignored. @@ -1245,7 +1301,12 @@ class NeoXArgsTextgen(NeoXArgsTemplate): text_gen_type: str = None """ How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + """ + + precompute_model_name: str = None + """ + Model name to use for saving precomputed logprobs """ temperature: float = 0.0 diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7b7a390ab..02926c2c3 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -23,12 +23,15 @@ import time from typing import List, Union +import numpy as np import torch import torch.nn.functional as F from megatron import print_rank_0 from megatron import mpu from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 +from megatron.data.indexed_dataset import make_builder, make_dataset +from megatron.mpu.mappings import gather_from_model_parallel_region def get_batch(neox_args, context_tokens: torch.Tensor): @@ -52,7 +55,9 @@ def get_batch(neox_args, context_tokens: torch.Tensor): return tokens, attention_mask, position_ids -def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): +def pad_batch( + context_tokens: List[List[int]], pad_id: int, pad_len: int, truncate: bool = False +): """ pads context lengths in context_tokens with pad_id to equal neox_args.seq_length, and returns the padded batch and the new lengths. 
@@ -60,17 +65,21 @@ def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): context_tokens: list of lists of tokens pad_id: int, integer to use as padding token pad_len: int, context length to be padded; all batch items will be padded to the same length + truncate: bool, if True, truncate context tokens to pad_len if they are longer than pad_len returns: tuple of padded context tokens and a list of unpadded token count """ context_lengths = [] - for tokens in context_tokens: + for i, tokens in enumerate(context_tokens): context_length = len(tokens) if context_length < pad_len: tokens.extend([pad_id] * (pad_len - context_length)) elif context_length > pad_len: - raise ValueError("context_length is bigger than to be padded length") + if not truncate: + raise ValueError("context_length is bigger than to be padded length") + context_tokens[i] = tokens[:pad_len] + context_length = pad_len context_lengths.append(context_length) return context_tokens, context_lengths @@ -807,3 +816,180 @@ def generate_samples_interactive( print_rank_0("Generated Text: " + generated_text) if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: _ = input("\n") + + +def get_logp(logits, labels, force_fp32=False): + if force_fp32: + logits = logits.float() + logp = logits.log_softmax(dim=-1) + return torch.gather(logp, dim=2, index=labels.unsqueeze(2)).squeeze(2) + + +def precompute_logits(neox_args, model): + """ + Precomputes logprobs from training/testing/validation datasets + + Saves it to the same directory as the dataset with the model name appended to it + + neox_args: NeoXArgs. + model: a Megatron model + + """ + if neox_args.precompute_model_name is None: + mdl_name = str(hash(neox_args.load)) + else: + mdl_name = neox_args.precompute_model_name + print_rank_0("Precomputing logprobs...") + model.eval() + data_paths = list() + if neox_args.train_data_paths is not None: + for path in neox_args.train_data_paths: + data_paths.append(path) + for path in neox_args.test_data_paths: + data_paths.append(path) + for path in neox_args.valid_data_paths: + data_paths.append(path) + elif neox_args.pos_train_data_paths is not None: + # Pairwise data... + for path in neox_args.pos_train_data_paths: + data_paths.append(path) + for path in neox_args.neg_train_data_paths: + data_paths.append(path) + for path in neox_args.pos_valid_data_paths: + data_paths.append(path) + for path in neox_args.neg_valid_data_paths: + data_paths.append(path) + for path in neox_args.pos_test_data_paths: + data_paths.append(path) + for path in neox_args.neg_test_data_paths: + data_paths.append(path) + for path in data_paths: + print_rank_0(f"Precomputing logits for {path}") + # Add hash to path... 
+ out_path = path + f"_{mdl_name}" + if os.path.exists(out_path + ".idx"): + continue + dataset = make_dataset(path, neox_args.data_impl, not neox_args.mmap_warmup) + if is_mp_rank_0(): + out_dataset = make_builder(out_path + ".bin", neox_args.data_impl) + out_dataset._dtype = np.float32 + i = 0 + while i < len(dataset): + start = time.time() + model.module.clear_cache() # clear kv cache between batches + if is_mp_rank_0(): + offset = ( + mpu.get_data_parallel_rank() + * neox_args.train_micro_batch_size_per_gpu + ) + context_tokens = [ + [int(x) for x in dataset.get(j % len(dataset)).tolist()] + for j in range( + i + offset, + i + (neox_args.train_micro_batch_size_per_gpu + offset), + ) + ] + # grab microbatch + # pad batch in order to allow conversion to tensor + context_tokens, context_lengths = pad_batch( + copy.deepcopy(context_tokens), + pad_id=0, + pad_len=neox_args.seq_length + 1, + truncate=True, + ) + # print(context_tokens) + label_tokens = [tokens[1:] for tokens in context_tokens] + context_tokens = [tokens[:-1] for tokens in context_tokens] + else: + context_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + label_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + context_lengths = [0 for _ in range(neox_args.batch_size)] + i += ( + neox_args.train_micro_batch_size_per_gpu + * mpu.get_data_parallel_world_size() + ) + # print(context_tokens) + # convert to tensor and broadcast + context_tokens = torch.cuda.LongTensor(context_tokens) + label_tokens = torch.cuda.LongTensor(label_tokens) + # Make sure context tokens + start tokens are the same across all ranks + token_generation_start_index = torch.cuda.LongTensor(context_lengths) + torch.distributed.broadcast( + context_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + token_generation_start_index, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + label_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + # context_tokens = context_tokens[:, :chop_len].contiguous() + # label_tokens = label_tokens[:, :chop_len].contiguous() + with torch.no_grad(): + # get attention mask / position ids + context_tokens, attention_mask, position_ids = get_batch( + neox_args, context_tokens + ) + model_inputs = ( + context_tokens, + position_ids, + attention_mask, + ) + maybe_tuple = forward_model( + model, model_inputs, neox_args.is_pipe_parallel + ) + if isinstance(maybe_tuple, tuple): + logits, _ = maybe_tuple + else: + logits = maybe_tuple + if logits is not None: # if pipe parallel, not all ranks return logits + logits = gather_from_model_parallel_region(logits) + logp = get_logp(logits, label_tokens, True).squeeze() + if neox_args.is_pipe_parallel: + # broadcast generated tokens to pipe parallel group + src_rank = model.grid.stage_to_global(model.num_stages - 1) + logp = ( + logp + if logits is not None + else torch.zeros( + neox_args.batch_size, dtype=torch.float32 + ).cuda() + ) + torch.distributed.broadcast( + tensor=logp, + src=src_rank, + group=mpu.get_pipe_parallel_group(), + ) + logp = logp.squeeze() + logp_list = [ + torch.zeros_like(logp) + for _ in range(mpu.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + logp_list, logp, group=mpu.get_data_parallel_group() + ) + logp = torch.cat(logp_list, dim=0).cpu().numpy() + if (mpu.get_model_parallel_rank() == 
0) and ( + mpu.get_data_parallel_rank() == 0 + ): + for j in range(logp.shape[0]): + out_dataset.add_item(logp[j]) + out_dataset.end_document() + print_rank_0(f"Processed {i} / {len(dataset)} in {time.time() - start}") + if is_mp_rank_0(): + out_dataset.finalize( + out_path + ".idx", + ) + torch.distributed.barrier() diff --git a/megatron/training.py b/megatron/training.py index fc3d9e129..d9932483a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -21,12 +21,14 @@ """Pretrain utilities.""" from datetime import datetime from functools import partial +from collections import defaultdict import math import sys from contextlib import nullcontext import torch +import torch.nn.functional as F import deepspeed from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler import numpy as np @@ -45,6 +47,7 @@ get_params_for_weight_decay_optimization, mark_norms_for_sequence_parallel_grad_sync, ) +from megatron.mpu.mappings import gather_from_model_parallel_region from megatron.checkpointing import load_checkpoint, save_checkpoint from megatron.data.data_utils import build_train_valid_test_data_iterators from megatron.initialize import initialize_megatron @@ -137,7 +140,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, _ = setup_model_and_optimizer( + model, optimizer, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -193,7 +196,7 @@ def pretrain(neox_args): # Model, optimizer, and learning rate. timers("model and optimizer").start() - model, optimizer, lr_scheduler = setup_model_and_optimizer( + model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer( neox_args=neox_args, use_cache=False, iteration=neox_args.iteration ) timers("model and optimizer").stop() @@ -231,6 +234,7 @@ def pretrain(neox_args): neox_args=neox_args, timers=timers, model=model, + reference_model=reference_model, optimizer=optimizer, lr_scheduler=lr_scheduler, train_data_iterator=train_data_iterator, @@ -282,12 +286,12 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): label_key = keys[1] if len(keys) > 1 else None # Unpack. tokens_ = data_b[token_key].long() - if "label" in data_b: + if label_key in data_b: label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous() labels = torch.where( data_b[label_key].long() >= 0, data_b[label_key].long(), - torch.zeros_like(data_b["label"].long()), + torch.zeros_like(data_b[label_key].long()), )[:, 1:].contiguous() else: label_mask = (tokens_.long() >= 0)[:, 1:].contiguous() @@ -311,7 +315,14 @@ def get_batch(neox_args, data_iterator): """Generate a batch""" # Items and their type. - keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] + if neox_args.train_impl == "normal": + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] + elif neox_args.train_impl == "dpo": + keys = ( + [["pos", "pos_label"], ["neg", "neg_label"]] + if neox_args.pos_train_label_data_paths + else [["pos"], ["neg"]] + ) datatype = torch.int64 # Broadcast data. 
@@ -319,13 +330,43 @@ def get_batch(neox_args, data_iterator): data = next(data_iterator) else: data = None - return _get_batch( - neox_args=neox_args, - tokenizer=neox_args.tokenizer, - keys=keys, - data=data, - datatype=datatype, - ) + if neox_args.train_impl == "normal": + return _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + elif neox_args.train_impl == "dpo": + pos_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[0], + data=data, + datatype=datatype, + ) + neg_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[1], + data=data, + datatype=datatype, + ) + if neox_args.precompute_model_name: + ref_data = mpu.broadcast_data(["pos_ref", "neg_ref"], data, torch.float) + else: + ref_data = {"pos_ref": None} + return [ + torch.cat((pos_item, neg_item), dim=0) + for pos_item, neg_item in zip(pos_tup, neg_tup) + ] + [ + torch.cat((ref_data["pos_ref"], ref_data["neg_ref"]), dim=0)[ + :, :-1 + ].contiguous() + if ref_data["pos_ref"] is not None + else None + ] def get_batch_pipe(data, neox_args, curr_scheduler=None): @@ -419,8 +460,23 @@ def mb_moe_loss_func(args, loss_mask, output_tensor=None): return averaged_lbl, loss_dict +def get_pos_neg_logp(logits, labels, force_fp32=False): + if force_fp32: + logits = logits.float() + logp = logits.log_softmax(dim=-1) + per_token_logp = torch.gather(logp, dim=2, index=labels.unsqueeze(2)).squeeze(2) + # Split to pos/neg... + return torch.chunk(per_token_logp, 2, 0) + + def forward_step( - data_iterator, model, neox_args, timers, return_logits=False, is_train=False + data_iterator, + model, + neox_args, + timers, + return_logits=False, + is_train=False, + reference_model=None, ): """Forward step.""" if neox_args.is_pipe_parallel: @@ -431,9 +487,14 @@ def forward_step( torch.cuda.nvtx.range_push(f"Get batch") if timers is not None: timers("batch generator").start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - neox_args=neox_args, data_iterator=data_iterator - ) + if neox_args.train_impl == "normal": + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) + if neox_args.train_impl == "dpo": + tokens, labels, loss_mask, attention_mask, position_ids, ref_logp = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) if timers is not None: timers("batch generator").stop() @@ -442,38 +503,100 @@ def forward_step( if neox_args.memory_profiling: torch.cuda.nvtx.range_push(f"Forward pass") - # Sequential returns moe_losses, but this is not yet supported by pipe parallel - maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) - if type(maybe_tuple) is tuple: - outputs, moe_losses = maybe_tuple - else: - outputs = maybe_tuple - moe_losses = [] - if ( - is_train - and neox_args.curriculum_learning - and neox_args.curriculum_seqlen < neox_args.seq_length - ): - loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() - labels = labels[:, : neox_args.curriculum_seqlen].contiguous() - main_loss = cross_entropy( - outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy - ) - if neox_args.moe_num_experts > 1: - if neox_args.moe_type == "deepspeed": - moe_loss = neox_args.moe_loss_coeff * sum(m.item() for m in moe_losses) - elif neox_args.moe_type == "megablocks": - moe_loss = mb_moe_loss_func(neox_args, loss_mask, outputs)[0] + metrics = {} + if neox_args.train_impl == 
"normal": + # Sequential returns moe_losses, but this is not yet supported by pipe parallel + maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if type(maybe_tuple) is tuple: + outputs, moe_losses = maybe_tuple else: - raise ValueError(f"Unsupported moe_type: {neox_args.moe_type}") - else: - moe_loss = 0.0 - loss = main_loss + moe_loss + outputs = maybe_tuple + moe_losses = [] + if ( + is_train + and neox_args.curriculum_learning + and neox_args.curriculum_seqlen < neox_args.seq_length + ): + loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() + labels = labels[:, : neox_args.curriculum_seqlen].contiguous() + main_loss = cross_entropy( + outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy + ) + if neox_args.moe_num_experts > 1: + if neox_args.moe_type == "deepspeed": + moe_loss = neox_args.moe_loss_coeff * sum(m.item() for m in moe_losses) + elif neox_args.moe_type == "megablocks": + moe_loss = mb_moe_loss_func(neox_args, loss_mask, outputs)[0] + else: + raise ValueError(f"Unsupported moe_type: {neox_args.moe_type}") + else: + moe_loss = 0.0 + loss = main_loss + moe_loss + elif neox_args.train_impl == "dpo": + # Based on https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90 + with torch.no_grad(): + # So we can gather token logps... + token_logp_labels = labels.clone() + token_logp_labels[token_logp_labels == -100] = 0 + pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0) + if ref_logp is None: + ref_maybe_tuple = reference_model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(ref_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + ref_outputs, _ = ref_maybe_tuple + else: + ref_outputs = ref_maybe_tuple + # gather across tensor parallel group + ref_outputs = gather_from_model_parallel_region(ref_outputs) + ref_pos, ref_neg = get_pos_neg_logp( + ref_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + else: + ref_pos, ref_neg = torch.chunk(ref_logp, 2, 0) + ref_pos = (ref_pos * pos_loss_mask).sum(-1) + ref_neg = (ref_neg * neg_loss_mask).sum(-1) + chosen_maybe_tuple = model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(chosen_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + chosen_outputs, _ = chosen_maybe_tuple + else: + chosen_outputs = chosen_maybe_tuple + chosen_outputs = gather_from_model_parallel_region(chosen_outputs) + chosen_pos, chosen_neg = get_pos_neg_logp( + chosen_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + chosen_pos = (chosen_pos * pos_loss_mask).sum(-1) + chosen_neg = (chosen_neg * neg_loss_mask).sum(-1) + with torch.no_grad(): + # Collect metrics... 
+ metrics["ref_neg"] = ref_neg.clone().detach().mean() + metrics["ref_pos"] = ref_pos.clone().detach().mean() + metrics["chosen_neg"] = chosen_neg.clone().detach().mean() + metrics["chosen_pos"] = chosen_pos.clone().detach().mean() + chosen_rewards = neox_args.dpo_beta * ( + chosen_pos.clone().detach() - ref_pos.clone().detach() + ) + rejected_rewards = neox_args.dpo_beta * ( + chosen_neg.clone().detach() - ref_neg.clone().detach() + ) + reward_acc = (chosen_rewards > rejected_rewards).float() + metrics["reward_acc"] = reward_acc.mean() + metrics["chosen_rewards"] = chosen_rewards.mean() + metrics["rejected_rewards"] = rejected_rewards.mean() + metrics["margins"] = (chosen_rewards - rejected_rewards).mean() + pi_logrations = chosen_pos - chosen_neg + ref_logrations = ref_pos - ref_neg + logits = pi_logrations - ref_logrations + loss = -F.logsigmoid(neox_args.dpo_beta * logits).mean() if neox_args.memory_profiling: torch.cuda.nvtx.range_pop() if return_logits: - return loss, outputs - return loss + return loss, outputs, metrics + return loss, metrics def get_model(neox_args, use_cache=False): @@ -548,9 +671,14 @@ def get_model(neox_args, use_cache=False): raise ValueError("Must be using deepspeed to run neox") -def get_optimizer(model, neox_args): +def get_optimizer(model, neox_args, dummy=False): """Set up the optimizer.""" - if neox_args.no_load_optim: + if neox_args.no_load_optim and neox_args.deepspeed: + # Required to have something so... + dummy = True + neox_args.optimizer = {"params": {"lr": 0.0}} + neox_args.optimizer_type = "adam" + elif neox_args.no_load_optim: return None, None if neox_args.optimizer is None: @@ -584,8 +712,13 @@ def get_optimizer(model, neox_args): _param_groups = [] for param_group in param_groups: trainable_params = [p for p in param_group["params"] if p.requires_grad] + if dummy: + trainable_params = [trainable_params[0]] # just take the first one param_group["params"] = trainable_params _param_groups.append(param_group) + if dummy: + # Only need one. + break param_groups = _param_groups # If we're using mup, then the optimizer must be adam or sgd @@ -699,7 +832,7 @@ def get_optimizer(model, neox_args): def get_learning_rate_scheduler(optimizer, neox_args): """Build the learning rate scheduler.""" - if neox_args.no_load_optim: + if (neox_args.no_load_optim) and not neox_args.deepspeed: # TODO: this should be configured as a separate arg return None if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam": @@ -744,19 +877,30 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" + needs_reference_model = (neox_args.train_impl == "dpo") and ( + neox_args.precompute_model_name is None + ) model = get_model(neox_args=neox_args, use_cache=use_cache) + if needs_reference_model: + reference_model = get_model(neox_args=neox_args, use_cache=use_cache) + else: + reference_model = None optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - + if neox_args.deepspeed and needs_reference_model: + # Need an optimizer & lr_scheduler so make a very small one to keep deepspeed happy... 
+ ref_optimizer, ref_param_groups = get_optimizer( + model=reference_model, neox_args=neox_args, dummy=True + ) + ref_lr_scheduler = get_learning_rate_scheduler( + optimizer=ref_optimizer, neox_args=neox_args + ) + else: + ref_optimizer, ref_param_groups, ref_lr_scheduler = None, None, None if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") - if neox_args.no_load_optim: - assert optimizer is None - _model_params = None - _lr_scheduler = None - else: - _model_params = param_groups if optimizer is None else None - _lr_scheduler = lr_scheduler + _model_params = param_groups if optimizer is None else None + _lr_scheduler = lr_scheduler model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, @@ -769,6 +913,16 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): # config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) + if needs_reference_model: + reference_model, _, _, _ = deepspeed.initialize( + model=reference_model, + optimizer=ref_optimizer, + args=neox_args, + lr_scheduler=ref_lr_scheduler, + dist_init_required=False, + model_parameters=ref_param_groups, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) mark_norms_for_sequence_parallel_grad_sync(model, neox_args) if neox_args.moe_num_experts > 1 and neox_args.moe_type == "megablocks": # We need to additionally set this flag to ensure DS parallelism properly handles this foreign MoE. @@ -805,6 +959,14 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): lr_scheduler=lr_scheduler, iteration=iteration, ) + if needs_reference_model: + _ = load_checkpoint( + neox_args=neox_args, + model=reference_model, + optimizer=ref_optimizer, + lr_scheduler=ref_lr_scheduler, + iteration=iteration, + ) print_rank_0( f"Loading checkpoint and starting from iteration {neox_args.iteration}" ) @@ -816,7 +978,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): if lr_scheduler is not None: lr_scheduler.optimizer = model.optimizer - return model, optimizer, lr_scheduler + return model, optimizer, lr_scheduler, reference_model def backward_step(neox_args, timers, optimizer, model, loss): @@ -838,7 +1000,15 @@ def backward_step(neox_args, timers, optimizer, model, loss): raise ValueError("Must be using deepspeed to run neox") -def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): +def train_step( + neox_args, + timers, + data_iterator, + model, + optimizer, + lr_scheduler, + reference_model=None, +): """Single training step.""" # Pipeline parallelism schedules forward/backward/step @@ -846,6 +1016,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = train_step_pipe( neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator ) + reduce_metrics = reduced_loss if ( neox_args.memory_profiling and neox_args.iteration >= neox_args.profile_step_start @@ -855,18 +1026,22 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) save_snapshot(neox_args) else: losses = [] + metric_dicts = defaultdict(list) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. 
timers("forward").start() - loss = forward_step( + loss, metric_dict = forward_step( neox_args=neox_args, timers=timers, data_iterator=data_iterator, model=model, is_train=True, + reference_model=reference_model, ) timers("forward").stop() losses.append(loss) + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # Calculate gradients, reduce across processes, and clip. if ( neox_args.profile @@ -916,17 +1091,19 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and torch.distributed.get_rank() == 0 ): save_snapshot(neox_args) - reduced_loss = { - "lm_loss": reduce_losses(losses).mean() - } # reduces losses across machines for logging + # reduces metrics across machines for logging + reduce_metrics = { + key: reduce_losses(metric_dicts[key]).mean() for key in metric_dicts.keys() + } + reduce_metrics["lm_loss"] = reduce_losses(losses).mean() if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: skipped_iter = 0 - collect_loss_for_unit_test(reduced_loss["lm_loss"]) - return reduced_loss, skipped_iter + collect_loss_for_unit_test(reduce_metrics["lm_loss"]) + return reduce_metrics, skipped_iter def train_step_pipe(neox_args, timers, model, data_iterator): @@ -952,6 +1129,7 @@ def train( neox_args, timers, model, + reference_model, optimizer, lr_scheduler, train_data_iterator, @@ -1007,6 +1185,7 @@ def train( model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, + reference_model=reference_model, ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() @@ -1097,6 +1276,7 @@ def evaluate( # Turn on evaluation mode which disables dropout. model.eval() losses = [] + metric_dicts = defaultdict(list) if neox_args.char_level_ppl: data_iterator = CharCounter(data_iterator, neox_args.tokenizer) @@ -1118,14 +1298,15 @@ def evaluate( else neox_args.gradient_accumulation_steps ): # Forward evaluation - loss = forward_step_fn( + loss, metric_dict = forward_step_fn( model=model, data_iterator=data_iterator, neox_args=neox_args, timers=timers, ) losses.append(loss) - + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # When contiguous memory optimizations are enabled, the buffers # allocated by the optimizations are deallocated during backward pass # in the absence of backward pass the buffers should be reset after each @@ -1135,6 +1316,8 @@ def evaluate( # reduces losses across processes for logging & run eval harness tasks eval_results = {"lm_loss": reduce_losses(losses).mean().item()} + for key in metric_dicts.keys(): + eval_results[key] = reduce_losses(metric_dicts[key]).mean().item() eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"]) if neox_args.char_level_ppl: diff --git a/megatron/utils.py b/megatron/utils.py index 26b4439bd..a64a8ba6c 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -449,7 +449,7 @@ def setup_for_inference_or_eval(use_cache=True, overwrite_values=None, input_arg initialize_megatron(neox_args) # set up model and load checkpoint. 
- model, _, _ = setup_model_and_optimizer( + model, _, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=use_cache, iteration=neox_args.iteration, diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py index 55623b303..4e101ea5a 100644 --- a/tools/datasets/preprocess_data_with_chat_template.py +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -105,6 +105,7 @@ def build_chat( chat_tokens = tokenizer.apply_chat_template( chat[: i + 1], add_generation_prompt=add_gen )[len(tokens) :] + # remove previous stuff... tokens.extend(chat_tokens) if only_last_turn and (i != len(chat) - 1): From 836aefaab925ba4f6afb0c265aa540b34573198c Mon Sep 17 00:00:00 2001 From: Aurelion <32250326+aurelion-source@users.noreply.github.com> Date: Sun, 8 Sep 2024 20:19:58 -0400 Subject: [PATCH 27/27] LayerNorm Refactor (#1269) * Add TE skeleton * Update NeoXArgs docs automatically * added option for te version of norms * import TERMSNorm * add te norm options to norm arg * add TE objects in weight decay function * reformat * add TERMSNorm and TELayerNorm * Update NeoXArgs docs automatically * - add Fused RMS Norm from apex * - make it consistent with how layernorm looks * Merged transformer engine and apex fused layernorm branches * Added assertion if TE is used * Removed unnecessary transformer-engine import * Changed importerror text for TE * Added requirements/requirements-transformerengine.txt * Add TE skeleton * Update NeoXArgs docs automatically * added option for te version of norms * import TERMSNorm * add te norm options to norm arg * add TE objects in weight decay function * reformat * add TERMSNorm and TELayerNorm * Update NeoXArgs docs automatically * - add Fused RMS Norm from apex * - make it consistent with how layernorm looks * Merged transformer engine and apex fused layernorm branches * Added assertion if TE is used * Removed unnecessary transformer-engine import * Changed importerror text for TE * Added requirements/requirements-transformerengine.txt * update comments * precommit --------- Co-authored-by: Quentin Anthony Co-authored-by: github-actions Co-authored-by: lintangsutawika Co-authored-by: lintangsutawika Co-authored-by: dmahan93 Co-authored-by: aurelion-source Co-authored-by: aurelion-source --- .pre-commit-config.yaml | 2 +- configs/neox_arguments.md | 6 +- megatron/model/fused_layer_norm.py | 114 ++++++++++++++- megatron/model/norms.py | 17 ++- megatron/model/transformer_engine.py | 137 ++++++++++++++++++ megatron/model/utils.py | 15 +- megatron/neox_arguments/neox_args.py | 11 +- .../requirements-transformerengine.txt | 1 + tests/README.md | 1 + 9 files changed, 293 insertions(+), 11 deletions(-) create mode 100644 megatron/model/transformer_engine.py create mode 100644 requirements/requirements-transformerengine.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7de35027a..249255306 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: hooks: - id: codespell args: [ - '--ignore-words-list=reord,dout', # Word used in error messages that need rewording + '--ignore-words-list=reord,dout,te', # Word used in error messages that need rewording. 
te --> transformerengine --check-filenames, --check-hidden, ] diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 413138597..d24b2b60a 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 53d0ae8 + Default = 217b4c5 current git hash of repository @@ -335,11 +335,11 @@ Model Arguments -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm', 'te_rmsnorm', 'te_layernorm'] Default = layernorm - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index d33ded506..3fd251147 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -21,7 +21,10 @@ except: HAVE_PERSIST_LAYER_NORM = False -from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction +from apex.normalization.fused_layer_norm import ( + FusedLayerNormAffineFunction, + FusedRMSNormAffineFunction, +) global fused_layer_norm_cuda @@ -148,3 +151,112 @@ def forward(self, input): ) return output + + +class MixedFusedRMSNorm(torch.nn.Module): + def __init__( + self, + normalized_shape, + eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_rmsnorm_1p=False, + mem_efficient_rms=True, + ): + super(MixedFusedRMSNorm, self).__init__() + + self.apply_rmsnorm_1p = apply_rmsnorm_1p + self.mem_efficient_rms = mem_efficient_rms + self.norm_fn = FusedRMSNormAffineFunction + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + if ( + normalized_shape not in persist_ln_hidden_sizes + or not HAVE_PERSIST_LAYER_NORM + ): + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.scale = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.scale, "sequence_parallel", self.sequence_parallel) + + def reset_parameters(self): + + if self.apply_rmsnorm_1p: + init.zeros_(self.scale) + else: + init.ones_(self.scale) + + def forward(self, input): + + weight = self.scale + 1 if self.apply_rmsnorm_1p else self.scale + # CPU path is here for unittest sake. + if not input.is_cuda: + print( + "WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedRMSNorm unit tests." + ) + # Latest pytorch actually supports F.rms_norm but I don't want to break builds so... 
+ return F.layer_norm(input, self.normalized_shape, weight, None, self.eps) + + # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect + # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet + if "memory_efficient" in inspect.getfullargspec(self.norm_fn.forward).args: + return self.norm_fn.apply( + input, + weight, + self.normalized_shape, + self.eps, + self.mem_efficient_rms, + ) + else: + return self.norm_fn.apply(input, weight, self.normalized_shape, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + return output diff --git a/megatron/model/norms.py b/megatron/model/norms.py index 19e1aeae6..ba175d3eb 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -18,8 +18,13 @@ def get_norm(neox_args): if neox_args.norm == "rmsnorm": - norm = RMSNorm eps = neox_args.rms_norm_epsilon + if neox_args.rmsnorm_fusion: + from .fused_layer_norm import MixedFusedRMSNorm + + norm = MixedFusedRMSNorm + else: + norm = RMSNorm elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon if neox_args.layernorm_fusion: @@ -31,6 +36,16 @@ def get_norm(neox_args): elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm + elif neox_args.norm == "te_rmsnorm": + from .transformer_engine import TERMSNorm + + norm = TERMSNorm + eps = neox_args.rms_norm_epsilon + elif neox_args.norm == "te_layernorm": + from .transformer_engine import TELayerNorm + + norm = TELayerNorm + eps = neox_args.layernorm_epsilon else: raise ValueError(f"norm {neox_args.norm} not recognized") return norm, eps diff --git a/megatron/model/transformer_engine.py b/megatron/model/transformer_engine.py new file mode 100644 index 000000000..338513a97 --- /dev/null +++ b/megatron/model/transformer_engine.py @@ -0,0 +1,137 @@ +import torch + +try: + import transformer_engine as te +except ImportError: + raise ImportError( + "Unable to import transformer-engine. Please refer to " + "https://github.com/NVIDIA/TransformerEngine for installation instructions." + ) + + +class TERMSNorm(torch.nn.Module): + def __init__(self, dim, eps=1e-8, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `RMSNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1e-8 + """ + super(TERMSNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.RMSNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELayerNorm(torch.nn.Module): + def __init__(self, dim, eps=1.0e-5, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1.0e-5 + """ + super(TELayerNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.LayerNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. 
+ """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDelayedScaling(te.common.recipe.DelayedScaling): + """ + Wrapper for the Transformer-Engine's `DelayedScaling` layer. + """ + + def __init__(self): + # TODO + return diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 97b409c1d..77e7f521d 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,6 +18,7 @@ """Utilities for models.""" import torch +from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes from megatron import mpu from types import GeneratorType @@ -35,9 +36,17 @@ def get_params_for_weight_decay_optimization(module, neox_args): "name": "no_weight_decay_params", } for module_ in module.modules(): - # apply weight decay to any "...Norm" modules. - if "norm" in type(module_).__name__.lower() or neox_args.weight_decay == 0.0: - # also include all parameters here if no weight decay is being done + if any( + [ + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, TELayerNorm), + isinstance(module_, TERMSNorm), + isinstance(module_, ScaleNorm), + ] + ) or ( + neox_args.weight_decay == 0.0 + ): # also include all parameters here if no weight decay is being done no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 814622a5b..b5e7a619d 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -162,9 +162,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Maximum number of position embeddings to use. This is the size of position embedding. """ - norm: Literal["layernorm", "rmsnorm", "scalenorm"] = "layernorm" + norm: Literal[ + "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm" + ] = "layernorm" """ - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". """ layernorm_fusion: bool = False @@ -172,6 +174,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Use fused layer norm kernel (if `norm` is `layernorm`). """ + rmsnorm_fusion: bool = False + """ + Use fused RMS norm kernel (if `norm` is `rmsnorm`). 
+    """
+
     use_qk_layernorm: bool = False
     """
     Use QK Normalization
diff --git a/requirements/requirements-transformerengine.txt b/requirements/requirements-transformerengine.txt
new file mode 100644
index 000000000..2050d7566
--- /dev/null
+++ b/requirements/requirements-transformerengine.txt
@@ -0,0 +1 @@
+git+https://github.com/NVIDIA/TransformerEngine.git@stable
diff --git a/tests/README.md b/tests/README.md
index f5ba5e560..32618d757 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -3,6 +3,7 @@
 Tests use pytest with coverage and forked plugins. Install with:
 
 ```bash
+pip install -r requirements/requirements.txt
 pip install -r requirements/requirements-dev.txt
 ```
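
The DPO objective wired into `forward_step` above (`train_impl = "dpo"`, scaled by `dpo_beta`, optionally computed in fp32 via `dpo_fp32`) reduces to a log-sigmoid over the difference between the policy and reference log-ratios of chosen versus rejected sequences. The sketch below is a minimal, self-contained illustration of that calculation on toy tensors; the helper names (`sequence_logp`, `dpo_loss`) and the toy shapes are invented for this example and are not part of the patch itself.

```python
# Minimal sketch (illustration only, not part of the patch): DPO loss from
# per-token log-probabilities, mirroring get_logp()/get_pos_neg_logp() and the
# -F.logsigmoid(beta * ((chosen_pos - chosen_neg) - (ref_pos - ref_neg))) loss above.
import torch
import torch.nn.functional as F


def sequence_logp(logits, labels, loss_mask, force_fp32=True):
    """Sum of per-token log-probs of `labels`, masked analogously to the loss-mask sums above."""
    if force_fp32:  # analogous to the dpo_fp32 option
        logits = logits.float()
    per_token = torch.gather(
        logits.log_softmax(dim=-1), dim=2, index=labels.unsqueeze(2)
    ).squeeze(2)
    return (per_token * loss_mask).sum(-1)


def dpo_loss(policy_pos, policy_neg, ref_pos, ref_neg, beta=0.1):
    """-log sigmoid(beta * ((pi_pos - pi_neg) - (ref_pos - ref_neg))), averaged over the batch."""
    pi_logratios = policy_pos - policy_neg
    ref_logratios = ref_pos - ref_neg
    return -F.logsigmoid(beta * (pi_logratios - ref_logratios)).mean()


# Toy usage: batch of 2 chosen/rejected pairs, sequence length 8, vocab size 16.
torch.manual_seed(0)
labels = torch.randint(0, 16, (2, 8))
mask = torch.ones(2, 8)
pol_pos = sequence_logp(torch.randn(2, 8, 16), labels, mask)
pol_neg = sequence_logp(torch.randn(2, 8, 16), labels, mask)
ref_pos = sequence_logp(torch.randn(2, 8, 16), labels, mask)
ref_neg = sequence_logp(torch.randn(2, 8, 16), labels, mask)
print(dpo_loss(pol_pos, pol_neg, ref_pos, ref_neg, beta=0.1))
```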