From 49cd41f7f1fb6b0fb34fdc3a648dbe4db8cc92e8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 11:27:14 -0400 Subject: [PATCH 01/27] Bump jinja2 from 3.1.3 to 3.1.4 in /requirements (#1211) Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.3 to 3.1.4. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.3...3.1.4) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a051200b5..501edf345 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -2,7 +2,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4 ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 -jinja2==3.1.3 +jinja2==3.1.4 lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 numpy>=1.22.0 From d037756332ea226358314c489347ad677b363af1 Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Thu, 16 May 2024 10:20:44 -0400 Subject: [PATCH 02/27] Run document update again (#1216) * misc changes to neox_args * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions --- configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c8e1492ae..dd10a0e09 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 6fb840e + Default = 8d175ed current git hash of repository @@ -1201,7 +1201,7 @@ Text Generation arguments -- **num_experts**: int +- **moe_num_experts**: int Default = 1 @@ -1243,7 +1243,7 @@ Text Generation arguments - **moe_token_dropping**: bool - Default = True + Default = False Whether to drop tokens when exceeding capacity @@ -1273,6 +1273,47 @@ Text Generation arguments +- **moe_type**: str + + Default = megablocks + + Either `deepspeed` or `megablocks` + + + +- **moe_glu**: bool + + Default = False + + Use gated linear units in MoE + + + +- **moe_lbl_in_fp32**: bool + + Default = False + + Whether to compute the load balancing loss in fp32. + + + +- **moe_jitter_eps**: float + + Default = None + + Coefficient for MoE routing jitter. 
Jitter is + not used if set to None + + + +- **enable_expert_tensor_parallelism**: bool + + Default = False + + Enable expert tensor parallelism + + + ## NeoXArgsTokenizer Tokenizer Arguments From 153e732f7df9676d33d97f07f5e2009ae7b1b2a4 Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Tue, 21 May 2024 18:34:53 -0400 Subject: [PATCH 03/27] Rwkv pipeline parallelism (#1221) * misc changes to neox_args * allow rwkv pp * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- megatron/neox_arguments/arguments.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index dd10a0e09..48c03f15a 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 8d175ed + Default = 0d5992f current git hash of repository diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index ff4f4bc21..98a444ea4 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1070,8 +1070,8 @@ def calculate_derived(self): ), "Mamba does not yet have dropout implemented" if "rwkv" in self.attention_config: assert ( - not self.is_pipe_parallel and self.model_parallel_size == 1 - ), "RWKV not currently compatible with parallelism" + self.model_parallel_size == 1 + ), "RWKV not currently compatible with model parallelism" if isinstance(self.zero_stage, int): assert self.zero_stage <= 2, "Zero stage 3 not compatible with RWKV" assert ( From 2746d43ede314e74fd3bda818d6a044ac3c71b9b Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 21 May 2024 18:36:21 -0400 Subject: [PATCH 04/27] Add Torch Profiler Support (#1226) * format: flagged on pre-commit * feat: add pytorch profiling * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- README.md | 11 ++++++++++- configs/neox_arguments.md | 2 +- images/pytorch_profiling.png | Bin 0 -> 89473 bytes megatron/data/helpers.cpp | 12 ++++++------ megatron/training.py | 22 ++++++++++++++++++++++ 5 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 images/pytorch_profiling.png diff --git a/README.md b/README.md index e7f61bf20..e11122f5e 100644 --- a/README.md +++ b/README.md @@ -640,7 +640,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, # Profiling -We support profiling with Nsight Systems and PyTorch Memory Profiling. +We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling. ## Nsight Systems Profiling @@ -656,6 +656,15 @@ The generated output file can then by viewed with the Nsight Systems GUI: ![Alt text](images/nsight_profiling.png) +## PyTorch Profiling + +To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`. + +The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within +TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). + +![Alt text](images/pytorch_profiling.png) + ## PyTorch Memory Profiling To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`. 
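As a rough illustration of how the profiling options named above fit together (not part of this patch: the key names are taken from the README text, while the step numbers and output path are placeholder assumptions), a training config might enable them like this:

```yaml
# Sketch only: enabling the built-in PyTorch profiler plus memory profiling
# via the config options named in the README section above.
# Step values and the output directory are illustrative assumptions.
profile: true                                 # turn on the PyTorch profiler
profile_step_start: 10                        # first step to trace (placeholder)
profile_step_stop: 12                         # last step to trace (placeholder)

memory_profiling: true                        # also record memory profiling output
memory_profiling_path: "./memory_profiles"    # assumed output path
```

The profiler trace then lands in the `tensorboard` log directory, as described above, and can be viewed through TensorBoard.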
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 48c03f15a..1dbb4dd8a 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 0d5992f
+    Default = b68ba6d

     current git hash of repository

diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png
new file mode 100644
index 0000000000000000000000000000000000000000..e85324dc694d7c11e922a0cdc54f38d364f99d59
GIT binary patch
literal 89473
[binary PNG data omitted: new image images/pytorch_profiling.png, 89473 bytes]
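For reference, the MoE options documented in patch 02 above can be read together as one config stanza. The following is only a sketch: the option names and defaults come from the generated `configs/neox_arguments.md` entries, while the concrete values are illustrative assumptions, not settings used anywhere in this series.

```yaml
# Sketch only: MoE options from the patch-02 docs, with illustrative values.
moe_num_experts: 8                        # docs default = 1
moe_type: "megablocks"                    # docs: either `deepspeed` or `megablocks`
moe_token_dropping: false                 # docs default = False
moe_glu: false                            # gated linear units in MoE
moe_lbl_in_fp32: false                    # compute load-balancing loss in fp32
moe_jitter_eps: 0.1                       # routing jitter; docs: not used if None (0.1 is assumed)
enable_expert_tensor_parallelism: false   # expert tensor parallelism, docs default = False
```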
z`<%m`o?+x!@2ZdLjti4?T^I(rIS~+!yyU-vffrojB$0!~u#XH+72_mS^%9sQW5Txj zT^vjVVBL}67?RaFdwqfPL+X!>At2-L0rSnd^tub4{&~ckVh=w>U5?l6-3@{q_-a)Id&8nHE_)rSSN~)IyvY zDR~}rYE8R$%TZh5Pm0dO}It6^292n zG}OqysIHQSRh7bcp~Y2zM8dPv2CZLp8&o_I)>klSm>ZHe1y=slO*;K0HLWNh?b^(7 zSSfa_dtWB(oWMAwK;Nh~w^BvASeYEh z5n`Xb`R`;VWqd2fk&Ve7>M2>^u3L{6^D|Y)HL#N6uabI&kt%``lxc&kf-|yhSPKWFoE)wz{9P0|hM~H_K|#Z6g4e%?=uhss9t_yQ@BXJ1D2SBi_w9iLw~z0WS`nboI`kvUt)@^-?& zI0KgpN0}`iS(j{=ngqEW&b{ZJCCe)n;rt7k+WPc;O67$IoSD}i6>|idAuHSo0=G(W zCU2e0zXb%Eixh`24Q`Sc6tEDl2ON-szI``heYbfpw1`4Gswe7MV`IhJ-`nKPdE;Dh z(yzoleQY7U`OX^DcE7-a($o3mvCIE?pR=z?yTT@AA)ULa^F1&OH?Ko-xFn6w11Vh7 zxy)g8KlRy{gmoYG;|Ew~z^>N$b+^+rS90& zWK&9EaVul>$>IjcwEVRD3i}H{1xkPQ^kR-SMO>`hkGj3QE9=`2yFOo9-<_MN^J{Y> zA0zWfD6=Fm(7!c{*{2>O^@&XQiI2UH_#EOx{6Ml&FEU2%lio_|$z~G^Zo5|ecOTCo z7f|2&eb<;HpwZ0>E|u9{NGU1Ymlu_ZR&Th@z^6|n^o%`$59pvnq27mw>$%90o0;Py z%jFKopo~q-KG+{a<6jn-!b64*zrd}}%uwW%BHQwv>CN)0Jgc`UQ0{X;N~;5Kw#dZT z6mt4b!~)OUQcbcQ!|-M^nT6UEP5Rt3V&vvHv2C8f0{W*(xyZej_LgfFYw+^u%}my( z{Y8@Yk@b@Ke1+CO(|73ZxdpUG>5P(Utvim^A(I#SMSsPX0ULa z4gcVVPEef9jumw=m}PThRg_g(j)UA_e*uEo*>Ow&4BH-sSqCOJ*@o`}SnJ5PWH&2d z=x!3~NjEqw-1>6ORRR_CYc?Mdn1A^iu8E;uT~L(1(`NfcIb~pV#fJr2d{Il_(}lU} z7123tosSohLqQb_C}Vfq3$%Vpnl^Zm$8M2o$t2+HMP6nO4ya|Rxc#rb}otG(;= z=E^l^PjS%U(C4#l=F_ zx08(eq;_p@T1UCQs7~VUX=;x8;H619-+*np;?V5kZYl)O`~_-h-woEmugo)TE@lX^ zyj-G6y58upZ}Kox(!kF$gpyLNp(*5yoq<;6s^2yQ&fd-ngUQ{bu_u|7&G35Dvbi`V zLTw2nBPm7uzq43#ep$Is(Y?V{iBavWsxoVFIKm?<@%ZIguqMn(D zfG&%6MjZG^!IkFYh;uLO@6B|~_JDCm1JV-L{YPSC;*ea*0-K@pw#{LN)PS-Bg|U>; zi;HbSXa{gv;H|#t5pUq)VIgymzIHyeLa)!xtjKjjsaI}GQ=ya?4qxYLSzKUav&6)x(|b|1O+AUi&26(aGpM7*VUHfq7k`H?VzbW8&1GW@Sg;R7v>cSF z3okQvjFLYo7rKs*zAZ{hx*!!7W7{FI*2qKPcfDjjQFKr5h()e|Do8Edo(d2OLw%2V zb4xf!Lm+m>!kLU9W3UeTYV*q_Q*lz@HX8aH%m?-XyGOV4KQ#Gx2{~!JRF?Djk(-9| z+>x7TFFsbhbCKS1FKMyQ$^}6)uEwLW&*X++d8$>+8TpEf_!Ju_y5!GO!6(A@`>>Z$j*MrL3y!M5`tMG4r-BAZeWHAo%f;Eo z0^CJ4?#7R4>%$TWHvcNGUAl2aF}KCG>pf4r^M3Asntt+H{Ou(DP&CX)Wc#?qt=)CZ zOTcD1=b{8toR1XRXm`x^w)YOba-MMUV!8D>_l#3C@Hf%otrNOht*k7~8hhK<#*X0I zbAM?mKK~AV`jcAi`dIp4s$836pR3!!dV$+|G{G4$Er(}O=-|Xk4(L>9w0-G<#p~Nj zK?NxZ(staK4uUv`3NF%iH^E}Fh#vH z@eA~)wDFu+A6K$t98LXi){WAzHbL=b02RxbUo+%SpE(3gL)AX89g_ME=(qCBBoJSA zP&hd5x-o818!cGkxeCW%`R(fG|{Jmzo zZ0$~{B$20CCKXYp{9I0f^Y&jMUYP5q;RgxCgMLVb#e-CD!P`SDuwVm`n<&unx7Tae zIn9L(>3=HA%<~Dp0OdH47Y?*IoICRNA@>sL3vJQRG7K^uAF9~5q3+jz$ZqPn***#c z-EZT}+ik&-BD{~ucdvSp#sKP2&QGYwdn*(8o%6;c1vmvEC+n&x*njy&Q0&Uu?zM6D zaFWsPUr=Xb>Y{~IXXAL9D%s-wgr)Jp40LF1;$cr8nr)X^8^Sd;mg;@@S%=M@20X0} zj4`yhP+^dyFL1N@yiqXe3sfPBe6iBq=!NE{ep)rP_D$4@F+_)Sz$w&{LDa&H?z@)8 zIU;2dFJNA~4D{2Yv4G(u>#KLbX0&Zn1ru!|f|Auf%Z}f6u&A!`dt)e{E{P4wRl7Y= z`wEL-WYUSh>dbwj1e~|ob&VF`f_tv^yV(SsM}v{kYoLslcH~LN@mRN=r>aH-Y}EXM zF`s>WT5P}O-m>XX?)J`?UenEMB7Tjl1yM+JX0~+KZq~?SvsGar}SAS~}GUfFlRY?JtE4cc$gkY`%YX6yRKHy=%kA z$i|d!pW+4@W0Rpgc(`ugUeuQK-5;%?4f^N)*k`BfJTGNx8e$#Vn_KP2HNBQ zbJencbhpUOX>g05z ztT=D$v6^mRw9AGNQCGjsel|ljDY^?#(XJ}yHkU_}nk!e1#!OrdRl33cb#hzJxGJEL zZH70({0f|?!RwF96hkC`@($_fOg&ljPOjUVH-(E+4jP?%t%0d<(Zl6BwKO}V$FUm? 
z*>EAR&%&|uyzBKQ&tl~WlTIiD5)s;5#{6phJ(6{}f3}XGcMs#5bqL~zmrJPOWfxJ4 zW@H2r0KFYKPtvM?WsWH?R24GObpnEV`{*_)CCo<2PpRStSZBP%*sL}(Wv~LB$%FG^ zn97m@XYEY1t*X3V!S*ASv9HX|_OciEP;V~-NM8ImN^~a^B*hJoV=sDD)74~T%|A*K z@d3X7<#I-vC^1c%DZV>zzd=3C8Vr}1k8@>~$-tmR_7cL`NWLVEUv8+gY|p7-bnY)( za~)(y$p3wp7Xw-xb9~A1Ix4SB-WL2?gHc36%JvIBW7s!#z4Om&r*Xo5nbp~xLvKbb{#GfQpWmbJPXluj-!_N+Yse-qpz&ns_pE{DcB?gy zZUyH6^O+F$@%?JX*lO964%|tS7xF#Ki!Ay1ji~aO_hYOK?Y{u>>7xkQa+jv<6Dr?I z>N+E~!Pf^QAks(FeI?iw^dV2)&aVG@=VEV3noZr0gu(XDcekSc;y*6d|NOcs|3H|u zfvLl&X;tgI;MZ#kKw`8JL)~6pht@_WW*s@@668Mnq-YVpOrGFux+RfkANVr333Xh( z=^$hx0yErqvXWo5o8smrt3y)k)ipjW#tLP>42D23zX$>6D-5J`E=GWHlmhInK2{0M zH*gY&g6;M$7t#XCB3F!>5&x|z2CLrE4GHtXxoI2mO`#4+4?8~A^iYwl@>-3aolb=@ zNC~>SPP4=9#4EivwC_>DPx1XT@c8KXl$@BkD?aGk7>DE-M6S$8+*gxrPqow7`t2mR zk4Gt)?dkDqBP(_7yLqaQ)XNIrTWP@=_3IjrmoQp18w=6ifu|cj*pR06ph-i6`$fyU zDcdyes@1oH`RR2_&8O$NPtoPmoflKP-nY+#BIjgnk1P7YnPX$59LjCvv8bU+DHa3% zZ@~t-dQOQcF|y*1!*P<|;R;h-H_t8uaHQ}+?>0_q%vOuwzW`8oAW6t|&(T)$h; ze-!&Nz}Y6d4QV~%aOGHkTk1aQoM+-!ZNN1&>lx&*o%LVyUtdj3z=#-t#tESOuurBE zVVs>QF*6?RUKjo|&a4@%t#L4P;Q2Ic+6c4 zfvTcS6P`d8lrFD23i$py2bV8DalxTJ2Mm8q9g&wd*igF^mW*_@WHM|Mfw+0+$il2Z76 zfuQ;|7N&*}r=X%L?n#=C-CjWbbfi1bNOa~g0^~-f^l&*%eZUZY5y5z*TGda}C_pMV; zgh#ZMFX?iJeUx?g&A}Y(JBLli=423Tw06i7*5Pur9+#3cz&85KcHjo$JwIP>;IwXk zrFs96?djg>oipm_w6KNyv2})ccLqgiGw<rNZcCiRgQ?ay?3=;J}8 z<9MJExaTX)KLz|rvUcjoI#FHpt42BU9XZ67p<+8BP}+~z2kxY#8eQ(ndEma|F1Asa zyEXFBWf^_W9+?U!?g)Bl;IVy9(th-@yKX9@Sz=@$BU0!(+H)m*Ar0ZayobX3GNnkJ z=E5A0yz+AJRWV4{>@_m-Kz~2sb!sQ_#v?DbNO(j@!a5hR|%=ecjSuzDW+{CUlDvBO@}E7|ulJ<5}-IOe<=tppU- zCd(2NN!Bd*JgMtuE)?z%xO7D>!3(0ndK13?tXFB>BCsGk%s|M$4@Wp6Z8v>{M33d_ zt%f^%VUx8eTCRdNIi3CITub6%GEZ`@(z#~eC&7sM{@)O+=77}iq-iQrWc59 z4~GPH`~0^b(JjPF&3(mY%cFxF zT9A>(v?-Z>C0u=0tej5C!=UTfT!x)s$4t(313GDa*5%yD&|OBmNBZi^vxS@Un5akB zJ2GAFjNTLd8}6F3wB6ZtpaC>;r+Pe|V_}{jZILTIET4JtS; z6Ncf)Qs1B6mUh(HwB2ry6c%@`u0lxOG6bPQU&QxX^Nu|S&=K7X_q{gq2Nr~ac<281vcMkHN|N+t zc@uyNxU5JB?CJj~_J*09~Pqn+i# zKYl~Imv_4&9crgX6wbz(_3#QG>9>AkE=y8J=Qd>ps8A2S+$Kjwr|kDtSx446R~CN* zIw`*#o8Bu$GJAkus=dg#$%{-Vh-q;{*s@qpLlF`F;X%OM8l~hB_;ZCKS>tj$H8~w+ zBS6?BrD{vVr{(jS5=BX|$YgPLR|u#OyEF`^pBSc)5%i!8i71S*oe+O9g1(i|Df&aB-eR&$H? 
z`BzFvpAD8W6wnsMG;iKNhMc0*3Dm^O6I0~xqdE66`$ivq9PIDux$Qw;PbttM4cYBQar>*x3fWi7z^4=%E>J0e@4ssB_2Ps$lt_wkm z-=nw8(fQDMt&n{?Dcp*JEuC;-^EC<;n?g3*iPYl1CiY!%q7u3 zn;oYvvB$R?&P0~QVhO8#z-8qes_#0LedKx5|D4+6oHg7xXS}47mjOxeEXneE7Ca(4 z?b|glmZYyrvP#^-Q7I(9^D6)ip3xxB=K*cgmfKL{hFokL2)b&)Fmf?E=zj4dYGM|N zv%1KV`hH&wn0;?a;c0+=;E5_AOA22kZr9oja()9q+XN(afHf(|Z%vus?5p&c`71wp zbANZexOh(Z8d7!Cd3l^6tFsCxa<}=q4OWS1WJwGddxN&xh=6#!E=`h^sAJwI%VUAK zZKQdp+ys`DR%o{2j{9{x3ut@9uWw#&v0DW4Bu)Uk^+Y+ql1X^~pUBl-DKxtxvnsVW zT_0bm*zo4gepi1FTbtXNa)D00YV^j_-R|D2#`U+^AwSPZs+fK+P+Ir8`}-sZ;d<8g z<-o{Q`)P~CLHp|FMi{xacaNXe|HQ%F?#WuJox`Cqj8!Kg0q!}IWn;(2u7wSA=rif)0$;MfOUoo= zy_cKpv@hx^x7ht7tS!D+nZ~JgVy|AiwSIW;`r}BJL2*gA=5ymxbJ;~qjqCDf3NB%( z=fBSWvDXC4sl`on!#c8mG}$;iUTLnlRa#9{z`F_c?t{<{@($ribd)mUYfIwMWPM8@V9d^3QxmdClh!14a^Ab- zG=%4+=EQr1SBB#y<_#PvLY6Ov_gR*j{*w||DBtB0vDD{W24hh}4)**^IAyBf1UN}UiIy*)rY5oncC7|Kt%Mk<50HmM6x zjvto;bV%->y^G`GJ5?g8eSDnynovA^0aM0{N3({Sif@<^;umUFNUe-((@Y3GFl^N0 zC3|U(@Y|GPV|nt9rxN|oj1z@01)3Q=y*ezb(*qszJ^V~?Z#{8N7kLm&aI7*H6EFOE zY6Dw2+XSFG1Mkh>SKS_Sb2aBR+;!aGyvla_?iyF<%DT+hJD=K9&7;Gm+qOgHMXP_f zpB;nOj1TPkH$-vB@NH|QzE9n*1se71QCFNXhGO>1PNSVhE z>t)^m5cV}*4CNC@Ovhh}--DXx&3^4vHU*84hiBZ^sQ1UPRc!gby@+f0an5mxbQa8v z_2$Cal`3l&>++nox>yq1R4FU16UkcFtS!|41Yq5S*S|+n?zfbEeDjM1TTD@eZy#4b z1G!Mplz;H28fTCNhEP|{lGPH&Hd$SbwUO(it&itfy`b(j>!ZLe0B3l4tQ5HKtZ{UY z@J4Q)sZ5Li4LIPPbN;p2+Lx@w8r$b8G}^M^j*Ey|L7D<{6PYHR7WCZHno-d064$^4 zp2^mOgO*xj;8%VrXKAi5IehzyM~dTh7l6UX@x!^lD~Y;{B7sE*svw|0qpy7e&XHvt z$W4`HV$kaT2hsxU`d8I9-&63TaGyt{eak8Dtx{v+Ove{E{+u*`1h+pVfO#CRXpM<@ zZxit^JZL#CGKGd#vmDg669Und;Cn+MI5?25tc0lgKa~tuZJ(HAVR(^xO+>GWp`;a@DYYHxv|f=*AD(@PfkXQhpL83PdyJL#g%x~$h<>A7a9gq#`rk> zkLe2dDHexAq za8a&cb1INYz-kjK!yq$a=Ub@^7C*dX4)-{4%*YV{ z?VQ)IY-}HV!_Jpwil?;E>Q#NCMfkv$6cq zm~TlvC0_qx{cS;F5D2kpG38TUL;J6{fkXs1be3p!|2DVPvma)Qm0J3Bso>=cW?ad2 zk@{y^pn8b_H(h2C6=qrNml#w25Ub>`2?5f#h63Ce7($g~xVWh}ex-dDQM)PiUa@_r zo~Z!(^l!-2cpWtg4;$WqAVL#lgH?m9%&|#CfXV~Rr+9}Hd-!Im=Z{mWs8R~7v$LmD z*NdCLuB#h=SsKMjtFRg?Bd8Uq?>)X^`pb97PgAu2QTce1H)s1KwwaEGKTOinc>lzv z@Kie-sMpO-S9(oSt7~i@=Sbgg4aV-;Xgi_nQ+ulkj8VU9$o~ihbRg{S z+Zp~u^t7`EzAHd~@z4JTFt&Wz5jS=M+sWH*PMEGrO|dU?7v!c595Pz82tX{GQn>%S z!^OJSBC7AV=l%AI!NvJJxk#yUkyEJVcYQPOWe;1Z9gdnq!P#(@1)%~4P_FdPZ%-l_ zw6l$90{6m$_hhTjy!y*S#i1GsV;Q5D(&@X+I&Jcua#%c@{@CwB5qbdA;#l~I1<)FH zxw~&%6`K6OHc$93c3mP^c1WWS{P)kcw)$$4oY361vbTLd>u1Q#b+s^fv7!o+_*?x8 zd>TXA?}l_n4{O_1J_i}bHDYAQ2=H$rNX%@+5S8*HZXD1$zv)f*Bmb&%sFV6BBy3@| zwV9ulkzIB-0z_AcQS4Ul9HG$2qkFI{itM6>O3ncQ+?NLp0+l-U~Vm`8dXCUxGd+x?mTiMAJ4 zB39Pd=Oy?a8;9mhmm(vomDi^G3LwFup@3^MW#tR@^BX%u*JG3EdE?>NMK2kqHmTqCc*sN8sJ;6#9!hlg|oGm?eSt45A_{ruT|Ufp)HZU^5*xFABh z*l|3ukJ-~dAVj2VS)3BnsndU@OblK7@%W3R&rU|Mv0_uPQH9+p?=m98hFLfSlHNP+pw{)8x0O2`| znk6j8j63Et%RKSyec8FckP^H(SXp0vix5DSK6JYswG@ft7vdcVc1t@|VnoGFGx|NM zEY-VDl&*ySS}H93{r>@f|L?H%e<0`o2TE+Ja!yxWee8(7LjZu$(ZlRGx&*6{@rOM( zsj%riLX7>GtDf>TR8>_vxou-=PW7lx&+qi~#Y$3DAm%Nxk(qS~Y|7t%DEYbMFjY{T zP()nO#)N~+Ywx5`S_rzw+p4OkxCTz+`$ra0a5+wKWV)LBg6Z%6-d*w__vxiAD&?g;xdTo*cCS5p+JKy*?7PzNvzf5MYGa8j?-3rqKEJ@l;rQGju*|#6&_+amwu?oY=-9}ukxIEoPxVca8XG;6 z-- z!4uS$6JND2sQdN=t}XQSXej^OVX&%eO=Yaa;M1oeYk~1U%0B;sE|^^^t--t~wJ>h( zExN@0c{RzrV@)!x{+xq0Nl4tlMDOT&U)^Sf(JtFR$nx!~4^-m9@7k(n`lzJ!=3fc zVw&elSI}xw04q-bH(vzQ9vd#c zU6lRsxx350Q^<(YJ9<6aGDp&u2>U;zMh?)m<2$4UBWl%eeWvj|TEh*#fS|AGYgPxL zG>O1`#U@>T@=h|(%`dz&V=at~HZalZ^EAnp=|jVAV6x!(#+NFNmT;DV4@v8rZjL<4 zZB4OlKj{oSOu4v5mT`B@E}CaurxwY3HYbUKsd;r*J9-ba$-W9D(&{gX>AU=(-$wcx zDSka+zB1?IugOhnx*m?&Lgv?rW1^J~ab)pEP-T$6Y=m5cbbGf21cj@%>71JnE0|IS z-ZJhNo@O^!1}&sH*G$)#q~KTDrSg3@=kRgqDZCah`ww6Z#6K7qD?gCyS0$%9%s-!p 
zfr2lEQ61RnSWq2KBdX^c3KH-f8SbCDc&auH5h^7@721O^OQAiZT_frm_?MdGts#;e z3@RD{zJ@=>M7#ie7+V=?Ei(7X-RM_jx~Q=I81_l4J@)}U!gj1V)RbYQr6_N>kjdO$ z18IsDm#vbMynbj+?UuRgR7k7N?K->2>nbw9=?Vk8eYhUeswt3mt%IF;;k!T*tEb=x zwJ=rAfP4hSq$1Aq4&K^ksatN!|5H$N5wM~RaQhti}4Wk*Y&eDAxK+a!;XmGHj!-! z_BT`v_5|BL5?C2zF<9X?UUr}NiM!369~?tj<8}gptRFA(0-ZoXvoB*L!)<2VY6%Kb zu0*`FpVjhXtr*NTDvp#B>}GdQ4e$Miq-t6IOAi1~yAnnR%gg{KfmqrzwF$i#>9aro z!~W?J?O(D^1!&EzQ+Pv^p6ZI{S8rR4aX{;ftCG5$kN?)+WPfLve|35$c$ptx_529| z`TL^#Mh`QZ6izCXwmz#hYSNdYLxKD}SN6N62p99RQ&4BuC8)%vKmd39r_RpA92q=C zI_IkE+1aRH#NYbsjmriV+<;|is0_8@fsQADy7#3t5e9dQtj(mS*vtbxVgz6DJf}Th zx%vbB%q4h#@=VDKg8CHlt@!68JV$y8LWM~F&$D-T9j3w#om^aypGKeBw>rtH&r4j{ zA@k7o0J^AlC?s)`f9q8;So$+VvZJ(7s>ij&3c+s5Kkth`zDrRq=`h<<$WZYgJ2XEZ zJ_nAvn{0xTwY0>{tb{ojgATs_o4kVxj3C)Y|yJHc8yrCKH|FngMz^TYDNI!(}iw8f@GU3kA9VI#25}v5AV6p`+<$BY|L0W_p+L3pV@b zUvn=h^5av7UGoQ6Er+#XCE_ak>D}-z*>Ck>3jebP^v5zu0=WEt42M{_(f_FC8@528 zWX^wrh;Blf|DyH(PBbOLp5+?gXLlL+&v1kXna*1GsVaGYrrF!?)3R_4usgW~MZOT| zgV{Dw!98ZCIP$++mf}P?z1MdGE(P%!!?I1AnOwTDL5k2N=z5OItgwoJGn-#+>(93* zeevuQDaOu9XyVR)s?JZ2yB!OI&eCFKc0Iq#=h3lt@!W#g=kJgp6Fs3<sn7qQlUsl*mSF9l|$~Qmce^?~8yNBP3$QWEyeiBgo zvJ#U04di|(i7#}vTk;6gl=)yBo>KX%2h-d9Z~eCRd(Ho{^Su@7f}O=tFs|w^%YAL= zW1qRbUrhncu2i=bxc{&Au00&eZH>KxNIU}vr(?e7@~AxBvdlO6H{bdwtJXy zDP(7(Nijktw-JRfE{zDYHSWnZ*TI+=3>jvIne(-e_Bs1`&L8`nzt11*pYK`E_pNWO z_g(M%uJwC=FPCHUdF$?~o+m?4j^W>W%%+(YVlZfTqucTafHLCV=9Wjt_3i47zOTTI zUTx-)-23EpkyLp)Me;&*sG`4nY$KP*ZKWwSK+$xo{6pwggY_SeRL{-y9*`F}y1O72 zOrj8Az~U6sw@U&@H7CE@9#2SlsBlP)xcst}158v4r`BrGIdZTHM z*0;x6f=1Towx~FWb@;A>5tjwk%&pH-?A^)QgzoSIOSrH*{<;g3ka6OcSQVWzb?5-t zgIEpqd|j4Fv*)I>+yYWrM(#crlPuuFhjp1FYHaq1Dx0l=G36b8*A zi*ki4g@_7m#VNVch3zg#&bpv(BV~A0<50i;*rW5w^CCh}tKY8pwj>EWHfa`c#4w0& zLxIQTYB&$!e5L05xxE{VqYFFiCHuR5tFB#1^QCUr6km;KCB zXd_*U#`l0aaKr{|=@PQQL@VvMTBgF1f05?kU035PMuZgXiZqZ#k*#0xpo0M8mpCxC zKPspv$O!%NTQ14(;0D~KiHq+y*pn@92F%>v#uZ&qS4@a8tK5`gkq@A%*&-5%lzil_ zDlGwGRpOmnz?2fi0FUVe+4 z{W}2P?zqfZGvTS*Z}stBk%XBU_sH{7Ua>Bhb~_AZKq(F2CD`uklG4y<3JQ^Fcd75v z%{>_oki%P~10}H$u4GRG=bqGrJ>{B}o9>^`NIChdZo(>LE%IE|`WdWdgUYT+PiGEV zK|#{{W^6Utq$DNf*%w(XY^*ksQVsLkiUfSZ_;}jLOd zZ5AfF7)7dA9<}dOX?jKNGph;|sevG)eW|$*y=B(>nRR`#U#2Hrw62(<_|>u_=UDok zo8IfjWzchIvo5qNO3YD+1WvXI{5M!7w4un2+D-8Lu7{UvsytfOR$V4Rbuc%RStj@t zIorViwM~E1`ZIu?Pm9>8C?wywf5ixz9y7MOaZcU+0m@zLyZHSnsQ>=TKdCmql+d@i zgA)}DQ_BnOb5<@vww<9M^N&dg>NXGqL~cb_Cdk>xk)QaH3Jbve6^YO? 
z=-n5_sVf5x;OrShHKhAsgH8!rM}&Bx5@Om^Y#xV;tvJ5M^FdlRCC+~H z`nYsTsfP(akF(_|0FHk)TVtq`79y-j?4xTu=0p?rWMjC}>9ndtxO%;|Twc+$qf>RD zXdz{hm3{upPrH+}x@ICDTBmfC@4rCqobjo2$vj-_bpPhmhm!s8`E2neD6ATI*?qJH z|IY6i{FG|m(sFQyfz$`V2fRjxWZ%W+l2Etgwl{DIeIXbxRFyAUnkRoVZY*c9!s~gX zgj(TrMcDOAS++U~zE-K~Yl^t{i-Ngu?pJzSacYu7Nrm?JNc^a(q}WCyME>i5gg_Uw6WDSaK2M&4;a`zV3?Yt-b*g4+trl!#Zgm5 z`-Y}StLB@bMUoHuP5bm`gIaBQ^vWVs+zFc*TA$fvf2bR_u*{Sh=Vz9fb0@p?&A`)G z*Km5NTKMx@W68v8kE0@vHvc=kPWK4K$=HtOfJxe6LF`-dRY+JHLt!m6L`e@_OZYv{svseQtx4Cdi!!v zX7mnaN_nx{deFj(Wm{t~%E6+Vx=GtL#RMVPG<(vDU&r$K<`HWQd-p{%^h zW(%lc1&U2Id*LAqU|6Y@2cb^1g{4S&Y4y}tk?;MfhV9ue;vfZT6b)ByC7$oLMhF$q z=_m~qccF1OCDk%Ckadat+BlSEXC)8N-sZhNHM@36*2^PO?^=izhptJ6}5 zLeyHw@*c80B+wV^HZek6Rmk=AzxAbaoyrDZcYQS_x{2o&uQW$Hcel@&P^M&heZvD> z131@)7$NTA#^g*kbK{jA#B}Nt74b51(f5Z2t>LNPb~rl}6ac_M<&{m)%8IDus`EXZ z+MTRY5NJnYtmsv3Ju4&N2OTCF@8f$F4N>{EnI>aAF(xBTV-55+GB6r03`2%ps!Xt5 z*)v8`MOYHf2jb-X2_N8FcXbHASTsA>Aga&l>6wJEfq(}Te0OczeLI3~& literal 0 HcmV?d00001 diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index aca290854..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/training.py b/megatron/training.py index 6a4e843ab..3265680c5 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -970,7 +970,28 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) + + if neox_args.profile: + schedule = torch.profiler.schedule( + wait=neox_args.profile_step_start, + warmup=1, + active=neox_args.profile_step_stop - neox_args.profile_step_start, + ) + prof = torch.profiler.profile( + schedule=schedule, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + neox_args.tensorboard_dir + ), + record_shapes=True, + profile_memory=True, + with_flops=True, + with_modules=True, + with_stack=True, + ) + prof.start() while iteration < neox_args.train_iters: + if neox_args.profile: + prof.step() if neox_args.profile and iteration == neox_args.profile_step_start: torch.cuda.cudart().cudaProfilerStart() loss_dict, skipped_iter = train_step( @@ -983,6 +1004,7 @@ def train( ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() + prof.stop() iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": From 1d557086119553f261ea53bb5c3e4ecf4d2ec5e3 Mon Sep 17 00:00:00 2001 From: Lang Xu <59843980+R0n12@users.noreply.github.com> Date: Tue, 21 May 2024 18:37:57 -0400 Subject: [PATCH 05/27] fixed fused_rope naming in JIT + added readme for amd support (#1224) --- README.md | 11 ++++++++++- 
megatron/fused_kernels/__init__.py | 6 +++--- tests/model/test_fused_kernels.py | 4 +--- 3 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md index e11122f5e..e63a59f28 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,6 @@ To install the remaining basic dependencies, run: pip install -r requirements/requirements.txt pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard -python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels ``` from the repository root. @@ -106,6 +105,16 @@ from the repository root. +### Fused Kernels +We now support AMD GPUs (MI100, MI250X) through JIT fused-kernel compilation. Fused kernels will be built and loaded as needed. To avoid waiting during job launching, you can also do the following for a manual pre-build: + +```python +python +from megatron.fused_kernels import load +load() +``` +This will automatically adapt the build process to different GPU vendors (AMD, NVIDIA) without platform-specific code changes. To further test fused kernels using `pytest`, use `pytest tests/model/test_fused_kernels.py` + ### Flash Attention To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 1e4c9efac..3694e964b 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -135,8 +135,8 @@ def _cpp_extention_load_helper( srcpath / "fused_rotary_positional_embedding.cpp", srcpath / "fused_rotary_positional_embedding_cuda.cu", ] - fused_rotary_positional_embedding_cuda = _cpp_extention_load_helper( - "fused_rotary_positional_embedding_cuda", + fused_rotary_positional_embedding = _cpp_extention_load_helper( + "fused_rotary_positional_embedding", sources, extra_cuda_flags, extra_include_paths, @@ -174,7 +174,7 @@ def load_fused_kernels(): print(e) print("=" * 100) print( - f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them" + f"ERROR: Fused kernels configured but not properly installed.
Please run `from megatron.fused_kernels import load()` then `load()` to load them correctly" ) print("=" * 100) exit() diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py index cc458bf4a..125eb6c52 100644 --- a/tests/model/test_fused_kernels.py +++ b/tests/model/test_fused_kernels.py @@ -30,9 +30,7 @@ ) -@pytest.mark.xfail( - reason="ModuleNotFoundError: No module named 'scaled_masked_softmax_cuda'" -) +@pytest.mark.xfail(reason="SystemExit: None") def test_load_fused_kernels(): load() try: From d3d59f2f1ff9b104ff024965b7519d757c955c40 Mon Sep 17 00:00:00 2001 From: yang <7129+yang@users.noreply.github.com> Date: Tue, 21 May 2024 15:39:08 -0700 Subject: [PATCH 06/27] Small tidying (#1222) * Tolerate no fused kernels * Fix requirements file syntax * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: Yang Zhang Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- megatron/model/norms.py | 7 +++++-- requirements/requirements.txt | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 1dbb4dd8a..c6d369524 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = b68ba6d + Default = 1b85a2f current git hash of repository diff --git a/megatron/model/norms.py b/megatron/model/norms.py index 8b06b177c..dda44659f 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -14,7 +14,6 @@ import torch from torch.nn import LayerNorm as LayerNorm -from .fused_layer_norm import MixedFusedLayerNorm def get_norm(neox_args): @@ -23,7 +22,11 @@ def get_norm(neox_args): eps = neox_args.rms_norm_epsilon elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon - norm = MixedFusedLayerNorm if neox_args.layernorm_fusion else LayerNorm + if neox_args.layernorm_fusion: + from .fused_layer_norm import MixedFusedLayerNorm + norm = MixedFusedLayerNorm + else: + norm = LayerNorm elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 501edf345..3ac92598a 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ -git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed +deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed ftfy>=6.0.1 -git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 +lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.4 lm_eval>=0.4.0,<=0.4.1 From dfc6722f2ab0e3efb65ce5b49449a2a8b14a26b7 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Sun, 26 May 2024 17:46:02 -0400 Subject: [PATCH 07/27] Fix markdown formatting error (#1217) * Update README.md * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/README.md | 2 +- configs/neox_arguments.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/README.md b/configs/README.md index d8ae81739..e14274b56 100644 --- a/configs/README.md +++ b/configs/README.md @@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter 
GPT model. For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) -Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. +Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. ```yaml # GPT-3 pretraining setup { diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c6d369524..306a0da5d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 1b85a2f + Default = a3fb470 current git hash of repository From b5c0afe42851dc4878aaa1b56021dc71b3ee435d Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:22:14 -0400 Subject: [PATCH 08/27] add workflow_dispatch to gh actions pr so we can run on command (#1233) * add workflow_dispatch to gh actions pr so we can run on command * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions --- .github/workflows/pull_request.yml | 2 +- configs/neox_arguments.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 3213718df..a2b1a2fc2 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,6 +1,6 @@ name: Pull Request -on: [pull_request] +on: [pull_request, workflow_dispatch] jobs: pre-commit: diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 306a0da5d..c60d1e15f 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = a3fb470 + Default = 516169c current git hash of repository From 4a34e0a565f19a8578210654afcb3bb835fcc35e Mon Sep 17 00:00:00 2001 From: jaimemcc <99298642+jaimemcc-intel@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:26:54 -0700 Subject: [PATCH 09/27] init changes to README (#1232) * init changes to README * Update NeoXArgs docs automatically * Update README.md * Update NeoXArgs docs automatically * Update README.md * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- tests/README.md | 77 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c60d1e15f..f6c3ecde3 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 516169c + Default = 7aa0074 current git hash of repository diff --git a/tests/README.md b/tests/README.md index 316096cc5..390a52898 100644 --- a/tests/README.md +++ b/tests/README.md @@ -32,7 +32,7 @@ pytest --forked tests/model/test_model_generation.py Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu. The test cases for cpu can be run with: -```` +``` pytest tests -m cpu ``` @@ -49,3 +49,78 @@ if You see this kind of error: RuntimeError: Cannot re-initialize CUDA in forked subprocess. 
To use CUDA with multiprocessing, you must use the 'spawn' start method ``` It usually means that you used some pytorch.cuda function before the test creates the processes. However just importing `from torch.utils import cpp_extension` can also trigger this. + + +## CPU Test Integration + +Tests can be run against physical CPUs through GitHub Actions. To have tests run on the physical CPU test, here is generally how the CI should be written: + +### runs-on + +The CI needs to be written to target the CPU Github Action runner. The jobs that need to run on CPU should use the hardware runner's labels: +```yaml +jobs: + cpu-test-job: + runs-on: [ 'self-hosted', 'aws', 'test'] # these labels tell GitHub to execute on the runner with the 'aws' and 'test' labels +``` + +### Software dependencies + +Hardware tests that need python and docker should install them as part of the test execution to make sure the tests run as expected: +```yaml +steps: + # sample syntax to setup python with pip + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + cache: "pip" + + # sample setup of docker (there's no official Docker setup action) + - name: Docker setup + run: | # taken from Docker's installation page: https://docs.docker.com/engine/install/ubuntu/ + # Add Docker's official GPG key: + sudo apt-get update + sudo apt-get install ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + # Add the repository to Apt sources: + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y +``` + +Any other software dependencies should be assumed to be missing and installed as part of the CI. + +### Using Docker image + +Using the Docker image and running tests in a container is recommended to resolve environment issues. There is a modified docker-compose.yml in tests/cpu_tests directory that is recommended to be used for CPU tests: + +```bash +cp tests/cpu_tests/docker-compose.yml . +# export any env variables here that should be used: +export NEOX_DATA_PATH='./data/enwik8' +docker compose run -d --build --name $CONTAINER gpt-neox tail -f /dev/null +# then can set up and run tests in the container using docker exec +docker exec $CONTAINER pip install -r /workspace/requirements-dev.txt +# etc. +# please clean up the container as part of the CI: +docker rm $CONTAINER +``` + +At the time of writing there is no built-in method to provide an offline-built Docker image to `jobs..container`. 
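(Illustrative aside, not part of the patch above: a minimal sketch of the "# etc." step, i.e. actually invoking the CPU-marked tests inside that container. It assumes the repository is mounted at `/workspace` and that `$CONTAINER` is the container started by the `docker compose run` command shown earlier; the exact pytest invocation is an assumption based on the `pytest tests -m cpu` usage in this README, not something defined by the workflow itself.)

```bash
# run only the tests marked with @pytest.mark.cpu inside the running container
docker exec $CONTAINER python -m pytest /workspace/tests -m cpu
# then clean up the container as shown in the snippet above
```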
+ +### Using existing CPU test CI + +There is an existing CPU test workflow that can be included in existing CI: + +```yaml +steps: + - name: Run CPU Tests + uses: + target_test_ref: $GITHUB_REF # replace with the ref/SHA that the tests should be run on + # have a look at the reusable workflow here: https://github.com/EleutherAI/gpt-neox/blob/main/tests/cpu_tests/action.yml +``` From 90a6cdb35f11d3a1892da4cb242c6a2576bcfb6a Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:24:49 -0500 Subject: [PATCH 10/27] fix summed biases not being divided by mp size (#1220) --- tools/ckpts/convert_hf_to_sequential.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/ckpts/convert_hf_to_sequential.py b/tools/ckpts/convert_hf_to_sequential.py index c53f28391..55cfc6517 100644 --- a/tools/ckpts/convert_hf_to_sequential.py +++ b/tools/ckpts/convert_hf_to_sequential.py @@ -119,16 +119,27 @@ def shard_sequential_mp(num_mp_ranks, sequential): ranks = {x: dict() for x in range(num_mp_ranks)} for k, v in sequential.items(): if reduce( + np.logical_or, + [ + x in k + for x in [ + "dense_4h_to_h.bias", + "attention.dense.bias", + ] + ], + ): + # Divide by tp_size since they get added together + for x in range(num_mp_ranks): + ranks[x][k] = v / num_mp_ranks + elif reduce( np.logical_or, [ x in k for x in [ "layernorm", "rotary_emb", - "dense_4h_to_h.bias", "norm.weight", "norm.bias", - "attention.dense.bias", ] ], ): From 2382bd4a6bfd0ec7199e1b7876cd8c457029e8e1 Mon Sep 17 00:00:00 2001 From: yang <7129+yang@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:27:01 -0700 Subject: [PATCH 11/27] Fix changed behavior of pipe_parallel (#1219) * Fix changed behavior of pipe_parallel * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: Yang Zhang Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- megatron/neox_arguments/arguments.py | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index f6c3ecde3..7a56e361e 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 7aa0074 + Default = 8451671 current git hash of repository diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 98a444ea4..9cad02c43 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -180,7 +180,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) config_files = dict() # iterate of all to be loaded yaml files for conf_file_name in paths_to_yml_files: - # load file with open(conf_file_name) as conf_file: conf = yaml.load(conf_file, Loader=yaml.FullLoader) @@ -477,7 +476,6 @@ def get_extra_deepspeed_args(self): return extra_ds_args def get_deepspeed_main_args(self): - args_list = list() if self.autotuning_run is not None: @@ -796,14 +794,11 @@ def calculate_batch_parameters( # either none of the three parameters are provided or just gradient_accumulation_step is provided else: - assert ( - False - ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" + assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" return int(train_batch), int(micro_batch), int(grad_acc) @staticmethod 
def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): - assert ( train_batch > 0 ), f"Train batch size: {train_batch} has to be greater than 0" @@ -1033,10 +1028,7 @@ def calculate_derived(self): # Update 'is pipe parallel' flag # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs - self.update_value( - "is_pipe_parallel", - self.pipe_parallel_size > 1 and self.moe_num_experts == 1, - ) + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) if self.moe_num_experts > 1: assert not ( self.is_pipe_parallel or self.pipe_parallel_size > 1 @@ -1106,8 +1098,8 @@ def calculate_derived(self): if "flash" in self.attention_config: _flash_version = packaging.version.Version(version("flash-attn")) if self.sliding_window_width is not None: - assert _flash_version >= packaging.version.Version( - "2.3.0" + assert ( + _flash_version >= packaging.version.Version("2.3.0") ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention." if self.pos_emb == "alibi": if not _flash_version >= packaging.version.Version("2.4.0.post1"): @@ -1234,7 +1226,6 @@ def validate_values(self): # Parameters sharing does not work with torch DDP. if (self.num_unique_layers is not None) and (self.num_layers is not None): - if not (self.num_unique_layers <= self.num_layers): error_message = ( self.__class__.__name__ From 4c426da8b6149e2313bc6e00584531f004cfe457 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Thu, 6 Jun 2024 21:37:48 -0400 Subject: [PATCH 12/27] Conversion script bugfixes (#1218) * update is_pipe_parallel logic ; handle tied-embeddings case correctly * Update NeoXArgs docs automatically * revert PP to be consistent * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 2 +- tools/ckpts/convert_neox_to_hf.py | 65 ++++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 7a56e361e..c884afd97 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 8451671 + Default = 714b299 current git hash of repository diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py index 35812383e..f4e0ccf9f 100644 --- a/tools/ckpts/convert_neox_to_hf.py +++ b/tools/ckpts/convert_neox_to_hf.py @@ -580,30 +580,59 @@ def convert( # Load output embedding if not sequential: - loaded_tp_ranks = load_partitions( - input_checkpoint_path, - mp_partitions, - get_key(loaded_config, "num-layers") + 4, - sequential=sequential, - ) + if get_key(loaded_config, "no-weight-tying", False): + # if we have trained input + output embedding layers without tied weights + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ) + else: + # in this case, output embedding layer and input embedding layer are tied. + # load + save the input embed weights into the output embedding layer's place. 
+ loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=0, + sequential=sequential, + ) # output embedding / LM head if architecture == "neox": # name of lm head / final linear proj varies lm_head = hf_model.embed_out else: lm_head = hf_model.lm_head - lm_head.load_state_dict( - { - "weight": torch.cat( - get_state( - loaded_tp_ranks, - "final_linear.weight", - layer_idx=get_key(loaded_config, "num-layers") + 4, - sequential=sequential, + + if get_key(loaded_config, "no-weight-tying", False): + # save the (untied) final linear into LM head for HF + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "final_linear.weight", + layer_idx=get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ), + dim=0, ), - dim=0, - ), - } - ) + } + ) + else: + # embedding layers are tied. transpose input layer and save + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "word_embeddings.weight", + layer_idx=0, + sequential=sequential, + ), + dim=0, + ), + } + ) del loaded_tp_ranks From 2608972a4957553bf6556044c8faf0bc28bcdafc Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:57:53 -0400 Subject: [PATCH 13/27] fix python version and pytest install (#1234) * fix python version and pytest install * Update NeoXArgs docs automatically * python3 * Update NeoXArgs docs automatically * pip not pip3 * Update NeoXArgs docs automatically * python3 pip * Update NeoXArgs docs automatically * python3 -m pip * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * add docker setup to workflow * Update NeoXArgs docs automatically * python setup * Update NeoXArgs docs automatically * python setup v2 * Update NeoXArgs docs automatically * python setup v3 * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * python setup v3 * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Add hash back to deep speed version * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- .github/workflows/pull_request.yml | 14 ++++++++++++-- configs/neox_arguments.md | 2 +- requirements/requirements.txt | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index a2b1a2fc2..99f7f988d 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -9,7 +9,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: "3.10.14" cache: "pip" cache-dependency-path: "**/requirements*.txt" # Need the right version of clang-format @@ -43,7 +43,17 @@ jobs: runs-on: self-hosted steps: - uses: 
actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: "3.10.13" + cache-dependency-path: "**/requirements*.txt" - name: prepare data - run: python prepare_data.py + run: python3 prepare_data.py + - name: install pytest + run: python3 -m pip install pytest pytest-forked pyyaml requests wandb + - name: install torch + run: python3 -m pip install torch + - name: install requirements + run: pip install -r requirements/requirements.txt - name: Run Tests run: pytest --forked tests diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index c884afd97..1e67685ed 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 714b299 + Default = 455446c current git hash of repository diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3ac92598a..b5a84674b 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,11 +1,11 @@ deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed ftfy>=6.0.1 -lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.4 +lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 -numpy>=1.22.0 +numpy<2.0 pybind11>=2.6.2 regex sentencepiece From 0e5f6db140819d80cd480d54c63cdc1aa5b818e3 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:30:02 -0500 Subject: [PATCH 14/27] Add a chat data preprocessing script (#1239) * Add a chat data preprocessing script * add EOT at end of a chat * update README.md * apply pre-commit --------- Co-authored-by: Quentin Anthony --- .github/workflows/pull_request.yml | 2 +- megatron/data/helpers.cpp | 12 +- megatron/model/norms.py | 1 + megatron/neox_arguments/arguments.py | 8 +- tools/datasets/README.md | 51 +++ .../preprocess_data_with_chat_template.py | 348 ++++++++++++++++++ 6 files changed, 412 insertions(+), 10 deletions(-) create mode 100644 tools/datasets/preprocess_data_with_chat_template.py diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 99f7f988d..53be528ae 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -52,7 +52,7 @@ jobs: - name: install pytest run: python3 -m pip install pytest pytest-forked pyyaml requests wandb - name: install torch - run: python3 -m pip install torch + run: python3 -m pip install torch - name: install requirements run: pip install -r requirements/requirements.txt - name: Run Tests diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... 
- } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/norms.py b/megatron/model/norms.py index dda44659f..19e1aeae6 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -24,6 +24,7 @@ def get_norm(neox_args): eps = neox_args.layernorm_epsilon if neox_args.layernorm_fusion: from .fused_layer_norm import MixedFusedLayerNorm + norm = MixedFusedLayerNorm else: norm = LayerNorm diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 9cad02c43..054689eda 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -794,7 +794,9 @@ def calculate_batch_parameters( # either none of the three parameters are provided or just gradient_accumulation_step is provided else: - assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" + assert ( + False + ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided" return int(train_batch), int(micro_batch), int(grad_acc) @staticmethod @@ -1098,8 +1100,8 @@ def calculate_derived(self): if "flash" in self.attention_config: _flash_version = packaging.version.Version(version("flash-attn")) if self.sliding_window_width is not None: - assert ( - _flash_version >= packaging.version.Version("2.3.0") + assert _flash_version >= packaging.version.Version( + "2.3.0" ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention." if self.pos_emb == "alibi": if not _flash_version >= packaging.version.Version("2.4.0.post1"): diff --git a/tools/datasets/README.md b/tools/datasets/README.md index f8215959c..af3009a23 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -93,6 +93,57 @@ output data: --dataset-impl {lazy,cached,mmap} Dataset implementation to use. Default: mmap +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates +``` +## `preprocess_data_with_chat_template.py` +Similar, but uses huggingface's [chat templates](https://huggingface.co/docs/transformers/main/en/chat_templating) to +tokenize the data to support multiturn and more complicated use cases. + +N.B. If using this, you **must** specify your data when training/finetuning with the following configs +```json +"train_data_paths": ["train_documents"], +"test_data_paths": ["test_documents"], +"valid_data_paths": ["test_documents"], +"label_data_paths": ["label_documents"] +``` + +the `"data_path"` option will not work with `"label_data_paths"`. + + +``` +usage: preprocess_data_with_chat_template.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--no-mask] + [--generation-role GENERATION_ROLE] [--only-last] [--num-docs NUM_DOCS] + --tokenizer-path TOKENIZER_PATH [--ftfy] --output-prefix OUTPUT_PREFIX + [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] + [--log-interval LOG_INTERVAL] + +options: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. 
Default: text + --no-mask If set, this will not mask any tokens in the input data. + --generation-role GENERATION_ROLE + The role of the model generating the chat, usually 'assistant'. Default: assistant + --only-last If set, this will mask everything except the last turn in the chat. + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-path TOKENIZER_PATH + Path to HF Tokenizer. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. Default: mmap + runtime: --workers WORKERS Number of worker processes to launch --log-interval LOG_INTERVAL diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py new file mode 100644 index 000000000..81770deff --- /dev/null +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -0,0 +1,348 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A script for processing a dataset such that chat templates are utilized in the creation of the data. +These are then used to perform instruction/chat model finetunes (for example, finetuning a model on only the assistant +portions of a chatml dataset). + +This follows the same output format as 'preprocess_data_with_mask.py' but using chat templates to generate the data. +This way we can support multiturn chat data in the finetuning process. instead of relying on a single turn of data. + +To run this script, first edit `tools/datasets/corpora.py` such that the command to call + `tools/datasets/preprocess_data_with_chat_template.py` is as follows: + +``` +cmd = f"python tools/datasets/preprocess_data_with_with_chat_template.py \ + --input {jsonl_filepath} \ + --output-prefix {parent_folder}/{self.name} \ + --tokenizer-path {hf-tokenizer} \ + --jsonl-keys {jsonl_keys} \ + --dataset-impl mmap \ + --workers {self.num_workers} " + +if self.only_last: + cmd += f"--only-last " + +if self.no_mask: + cmd += f"--no-mask " +``` + +Then, specify +``` +"train_data_paths": ["/path/to/dataset/name_text_document"], +"label_data_paths": ["/path/to/dataset/name_label_document"] +``` +in your YML config. This will then allow for finetuning on the data with loss masks set appropriately. 
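For reference, a single illustrative input record is shown below (one JSON object per line). The `conversation` key matches this script's default `--jsonl-keys` value, and each turn uses the `role`/`content` fields expected by Hugging Face chat templates; the message text itself is made up:

```
{"conversation": [{"role": "user", "content": "What does pipeline parallelism mean?"}, {"role": "assistant", "content": "It splits a model's layers across devices so different stages run in parallel."}]}
```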
+ +""" + +import argparse +import multiprocessing +import os +import sys + +import lm_dataformat as lmd +import numpy as np + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) + +import time +import tqdm +import jsonlines + +from megatron.data import indexed_dataset +from threading import Semaphore +from typing import List, Dict, Tuple +from transformers import AutoTokenizer, PreTrainedTokenizer + + +def build_chat( + chat: List[Dict[str, str]], + generation_role: str, + apply_mask: bool, + tokenizer: PreTrainedTokenizer, + only_last_turn: bool = False, +) -> Tuple[List[int], List[int]]: + """ + Build a chat from a list of dictionaries. Each dictionary should have a "role" and "content" key, this follows the + Chat Template from https://huggingface.co/docs/transformers/main/en/chat_templating + + :param chat: A list of dictionaries with "role" and "content" keys + :param generation_role: The role of the model generating the chat, usually "assistant" + :param apply_mask: Whether to apply a loss mask to the chat, if False, all tokens will be included in the loss + :param tokenizer: A HF tokenizer + :param only_last_turn: Whether to only include the last turn in the chat, needed for some fine-tuning tasks + """ + tokens = [] + mask = [] + if apply_mask is False: + tokens = tokenizer.apply_chat_template(chat) + mask = tokens + return tokens, mask + for i, turn in enumerate(chat): + add_gen = ( + False if i == len(chat) - 1 else chat[i + 1]["role"] == generation_role + ) + chat_tokens = tokenizer.apply_chat_template( + chat[: i + 1], add_generation_prompt=add_gen + ) + # remove previous stuff... + tokens.extend(chat_tokens) + if only_last_turn and (i != len(chat) - 1): + mask.extend([-100] * len(chat_tokens)) + elif apply_mask and (turn["role"] != generation_role): + mask.extend([-100] * len(chat_tokens)) + else: + mask.extend(chat_tokens) + if tokenizer.eos_token_id is not None: + mask.append(tokenizer.eos_token_id if mask[-1] != -100 else -100) + tokens.append(tokenizer.eos_token_id) + return tokens, mask + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_path) + + def encode(self, text): + ids = {} + for key in self.args.jsonl_keys: + text_ids, label_ids = build_chat( + text[key], + self.args.generation_role, + not self.args.no_mask, + Encoder.tokenizer, + self.args.only_last, + ) + ids[key] = (text_ids, label_ids) + return ids, len(text) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " + "list", + ) + group.add_argument( + "--jsonl-keys", + nargs="+", + default=["conversation"], + help="space separate listed of keys to extract from jsonl. Default: text", + ) + group.add_argument( + "--no-mask", + help="If set, this will not mask any tokens in the input data.", + action="store_true", + ) + group.add_argument( + "--generation-role", + type=str, + default="assistant", + help="The role of the model generating the chat, usually 'assistant'. 
Default: assistant", + ) + group.add_argument( + "--only-last", + help="If set, this will mask everything except the last turn in the chat.", + action="store_true", + ) + group.add_argument( + "--num-docs", + default=None, + help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", + type=int, + ) + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-path", + type=str, + required=True, + help="Path to HF Tokenizer.", + ) + group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + group.add_argument( + "--dataset-impl", + type=str, + default="mmap", + choices=["lazy", "cached", "mmap"], + help="Dataset implementation to use. Default: mmap", + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument( + "--log-interval", + type=int, + default=100, + help="Interval between progress updates", + ) + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.model_parallel_size = 1 + + return args + + +def yield_from_files(fnames: list, semaphore): + """ + Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / + other compressed formats. Also filters out empty documents. + + :param fnames: list of filenames + """ + + def yielder(fname, semaphore): + with open(fname, encoding="utf-8") as f: + reader = jsonlines.Reader(f) + for f in reader: + semaphore.acquire() + yield f + + for fname in fnames: + semaphore.acquire() + + yield from yielder(fname, semaphore) + + +def main(): + args = get_args() + encoder = Encoder(args) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and + # hence building up memory + semaphore = Semaphore(10000 + args.workers) + + # use multiprocessing to iterate over input documents + fin = yield_from_files(args.input.split(","), semaphore) + + if args.workers > 1: + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + else: + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) + + # make a dataset builder for each key in args.jsonl_keys + # each key will output to a different file beginning with args.output_prefix + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.jsonl_keys: + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[key]._dtype = np.int32 + if not args.no_mask: + assert ( + key + "_label" not in args.jsonl_keys + ), "label should not be included as it will be generated according to the mask." 
+ key += "_label" + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[key]._dtype = np.int32 + + # actually do tokenization + proc_start = time.time() + total_bytes_processed = 0 + pbar = tqdm.tqdm() + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + + # release semaphore so `yield_from_files` can add another file to the buffer + semaphore.release() + + # add each tokenized document / sentence + for key, conv in doc.items(): + tokens = conv[0] + token_mask = conv[1] + builders[key].add_item(np.array(tokens, dtype=builders[key].dtype)) + builders[key + "_label"].add_item( + np.array(token_mask, dtype=builders[key + "_label"].dtype) + ) + # add indx... + builders[key].end_document() + builders[key + "_label"].end_document() + if i == 1: + print("key: ", key) + print("tokens: ", tokens) + print("token_mask: ", token_mask) + # log progress + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + pbar.set_description( + f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)." + ) + if i != 0: + pbar.update(args.log_interval) + + # save output file + update_keys = args.jsonl_keys + for key in update_keys: + builders[key].finalize(output_idx_files[key]) + builders[key + "_label"].finalize(output_idx_files[key + "_label"]) + + +if __name__ == "__main__": + main() From 1cee5b7c7074302de4867ad5cac3f1ea26f7a7d7 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Fri, 28 Jun 2024 11:21:46 -0500 Subject: [PATCH 15/27] Fix paper reference in init_functions.py (#1241) --- megatron/model/init_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 86a003dbd..8a0b8e251 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -145,7 +145,7 @@ def init_(tensor, use_mup=use_mup_outer): def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving - the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" + the Normalization of Self-Attention - Nguyen, T. & Salazar, J. 
(2019), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) def init_(tensor, use_mup=use_mup_outer): From c1ea2a1ff1c062ed89ede27780cecf0122ae3f42 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Mon, 5 Aug 2024 23:14:27 -0500 Subject: [PATCH 16/27] Add hf llama to neox conversion (#1247) * - Add conversion of HF llama models to NeoX * - Add conversion of HF llama models to NeoX * - minor fix * pre-commit --------- Co-authored-by: Quentin Anthony --- tools/ckpts/README.md | 17 ++ tools/ckpts/convert_hf_llama_to_neox.py | 219 ++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 tools/ckpts/convert_hf_llama_to_neox.py diff --git a/tools/ckpts/README.md b/tools/ckpts/README.md index 24d5cf31c..770cfb9c6 100644 --- a/tools/ckpts/README.md +++ b/tools/ckpts/README.md @@ -131,3 +131,20 @@ options: --num_output_shards NUM_OUTPUT_SHARDS --pipeline_parallel Only use if PP>1 ``` + +### `convert_hf_llama_to_neox.py` +Takes an HF Llama checkpoint and puts it into a NeoX-compatible format. + +Note that this does not support pipeline parallelism! + +``` +usage: convert_hf_llama_to_neox.py [-h] [--tp TP] [--pp PP] [--model MODEL] [--model_path MODEL_PATH] + +options: + -h, --help show this help message and exit + --tp TP Number of tensor parallelism ranks + --pp PP Number of pipeline parallelism stages + --model MODEL HF model name + --model_path MODEL_PATH + Path to save model +``` diff --git a/tools/ckpts/convert_hf_llama_to_neox.py b/tools/ckpts/convert_hf_llama_to_neox.py new file mode 100644 index 000000000..2adddb19d --- /dev/null +++ b/tools/ckpts/convert_hf_llama_to_neox.py @@ -0,0 +1,219 @@ +import torch +import argparse +from transformers import AutoTokenizer, AutoModelForCausalLM +import os +import tqdm + + +def convert_model(hf_state_dict, hf_config, tp_ranks): + conv_state_dicts = [{} for _ in range(tp_ranks)] + # get embeddings... + for i, chunk in enumerate( + torch.chunk(hf_state_dict["model.embed_tokens.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + "sequential.0.word_embeddings.weight" + ] = chunk.clone().detach() + print( + "model.embed_tokens.weight", + hf_state_dict["model.embed_tokens.weight"].shape, + "sequential.0.word_embeddings.weight", + conv_state_dicts[0]["sequential.0.word_embeddings.weight"].shape, + ) + # Get config data... + num_kv_heads = hf_config.num_key_value_heads + num_q_heads = hf_config.num_attention_heads + head_dim = hf_config.hidden_size // num_q_heads + # do layers... + for layer_num in tqdm.tqdm(range(model.model.config.num_hidden_layers)): + # --- attention --- + # Output first since it's a simple row parallel... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.dense.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.self_attn.o_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.dense.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.dense.weight" + ].shape, + ) + # Now for attention... + # Split into heads... 
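# (Summary of the next block, added for clarity; not part of the diff.)
# q, k and v are reshaped so their leading dimension is num_q_heads, matching
# how the GQA attention code splits heads; that dimension is then chunked
# across tensor-parallel ranks, and each rank's q/k/v chunks are concatenated
# into the fused "attention.query_key_value.weight" layout that NeoX expects.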
+ q = hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"] + k = hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"] + v = hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"] + # The GQA code splits the heads by the num_q_heads so we also do that + # here to ensure it matches... + q = q.view(num_q_heads, -1, q.shape[-1]) + k = k.view(num_q_heads, -1, q.shape[-1]) + v = v.view(num_q_heads, -1, q.shape[-1]) + # Chunk for tensor parallelism... + for i, q_chunk, k_chunk, v_chunk in zip( + range(tp_ranks), + torch.chunk(q, tp_ranks, dim=0), + torch.chunk(k, tp_ranks, dim=0), + torch.chunk(v, tp_ranks, dim=0), + ): + # Need to join the heads across q, k, v... + conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ] = ( + torch.cat([q_chunk, k_chunk, v_chunk], dim=1) + .view(-1, q.shape[-1]) + .clone() + .detach() + ) + print( + f"model.layers.{layer_num}.self_attn.(q/k/v)_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.query_key_value.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ].shape, + ) + # --- mlp --- + # Do SwiGLU weights... + # w1... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"], + tp_ranks, + dim=0, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.w1.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.gate_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w1.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.w1.weight"].shape, + ) + # w3... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"], + tp_ranks, + dim=0, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.w3.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.up_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w3.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.w3.weight"].shape, + ) + # w2 (output)... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.w2.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.down_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w2.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.w2.weight"].shape, + ) + # --- norm --- + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{layer_num+2}.input_layernorm.scale"] = ( + hf_state_dict[f"model.layers.{layer_num}.input_layernorm.weight"] + .clone() + .detach() + ) + conv_state_dicts[i][ + f"sequential.{layer_num+2}.post_attention_layernorm.scale" + ] = ( + hf_state_dict[ + f"model.layers.{layer_num}.post_attention_layernorm.weight" + ] + .clone() + .detach() + ) + + # Get final ln/linear.... 
+ index = model.model.config.num_hidden_layers + 3 + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{index}.norm.scale"] = ( + hf_state_dict["model.norm.weight"].clone().detach() + ) + index += 1 + # do output... + for i, chunk in enumerate( + torch.chunk(hf_state_dict["lm_head.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + f"sequential.{index}.final_linear.weight" + ] = chunk.clone().detach() + print( + "lm_head.weight", + hf_state_dict["lm_head.weight"].shape, + f"sequential.{index}.final_linear.weight", + conv_state_dicts[0][f"sequential.{index}.final_linear.weight"].shape, + ) + return conv_state_dicts + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tp", type=int, default=1, help="Number of tensor parallelism ranks" + ) + parser.add_argument( + "--pp", type=int, default=0, help="Number of pipeline parallelism stages" + ) + parser.add_argument("--model", type=str, default="gpt2", help="HF model name") + parser.add_argument( + "--model_path", type=str, default=None, help="Path to save model" + ) + args = parser.parse_args() + assert args.pp == 0, "Pipeline parallelism not supported yet" + tokenizer = AutoTokenizer.from_pretrained(args.model).save_pretrained( + args.model_path + "/tokenizer" + ) + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype="auto") + state_dict = model.state_dict() + for key in state_dict.keys(): + print(key, state_dict[key].shape) + os.makedirs(args.model_path, exist_ok=True) + # Setup model directory... + os.makedirs(f"{args.model_path}/0", exist_ok=True) + # Save the latest file so neox can figure out where to grab the weights... + with open(f"{args.model_path}/latest", "w") as f: + f.write("0") + # Convert the model... + tp_state_dicts = convert_model(state_dict, model.model.config, args.tp) + for i in range(args.tp): + torch.save( + { + "dp_world_size": 1, + "mp_world_size": args.tp, + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + "module": tp_state_dicts[i], + }, + f"{args.model_path}/0/mp_rank_{i:02d}_model_states.pt", + ) From 0ef2c074ac03c2b888e9003e7ce4c166cb78cc82 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:26:15 -0500 Subject: [PATCH 17/27] bugfix: chat turns instead of repeating the conversation in preprocess_data_with_chat_template.py (#1258) * bugfix: chat turns instead of repeating the conversation * pre-commit --- tools/datasets/preprocess_data_with_chat_template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py index 81770deff..55623b303 100644 --- a/tools/datasets/preprocess_data_with_chat_template.py +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -104,7 +104,7 @@ def build_chat( ) chat_tokens = tokenizer.apply_chat_template( chat[: i + 1], add_generation_prompt=add_gen - ) + )[len(tokens) :] # remove previous stuff... 
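# Illustration of the fix above (comment only, not part of the hunk):
# tokenizer.apply_chat_template(chat[: i + 1]) re-renders the conversation
# from the first turn on every iteration. Without the "[len(tokens) :]" slice,
# iteration i appends turns 0..i again, so a three-turn chat ends up encoded
# roughly as: turn0 | turn0 turn1 | turn0 turn1 turn2. Slicing off the first
# len(tokens) ids keeps only the newly rendered turn, giving turn0 | turn1 | turn2.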
tokens.extend(chat_tokens) if only_last_turn and (i != len(chat) - 1): From f8c9e68c4984a0b6f7f5f276b563d2612a6dce9f Mon Sep 17 00:00:00 2001 From: jaimemcc <99298642+jaimemcc-intel@users.noreply.github.com> Date: Thu, 15 Aug 2024 14:57:02 -0700 Subject: [PATCH 18/27] Conversion for CI from self-hosted hardware (#1245) * changing from self-hosted runners to Github's ubuntu-22.04 runner environment * adding warning about not using 'self-hosted' runner labels and using Github runners instead * updated some guidance in comments for coverity scan CI * moving CPU tests to workflow_dispatch only --- .github/workflows/{cpu_ci_on_pr.yml => .cpu_ci_on_pr.yml} | 4 +++- .github/workflows/coverity_scan.yml | 5 +++-- .github/workflows/cpu_ci.yml | 2 +- .github/workflows/cpu_ci_dispatch.yml | 2 +- .github/workflows/pull_request.yml | 5 +++-- tests/README.md | 2 ++ 6 files changed, 13 insertions(+), 7 deletions(-) rename .github/workflows/{cpu_ci_on_pr.yml => .cpu_ci_on_pr.yml} (58%) diff --git a/.github/workflows/cpu_ci_on_pr.yml b/.github/workflows/.cpu_ci_on_pr.yml similarity index 58% rename from .github/workflows/cpu_ci_on_pr.yml rename to .github/workflows/.cpu_ci_on_pr.yml index 971640c18..43ce025c0 100644 --- a/.github/workflows/cpu_ci_on_pr.yml +++ b/.github/workflows/.cpu_ci_on_pr.yml @@ -1,3 +1,5 @@ +# This file is hidden (.cpu_cpi_on_pr.yml) to minimize the number of runner minutes consumed. + name: "Pull Request CPU Tests" on: @@ -7,7 +9,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04 but 24.04 is in beta - recommend testing on 24.04 and then changing instead of using ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/coverity_scan.yml b/.github/workflows/coverity_scan.yml index a79d0d8fb..128d279cc 100644 --- a/.github/workflows/coverity_scan.yml +++ b/.github/workflows/coverity_scan.yml @@ -17,9 +17,10 @@ jobs: runs-on: ubuntu-latest env: - COV_USER: ${{ secrets.COV_USER }} + COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }} - COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} + COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard: + # https://scan.coverity.com/projects/?tab=project_settings steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/cpu_ci.yml b/.github/workflows/cpu_ci.yml index 9160fccab..6910b8a1c 100644 --- a/.github/workflows/cpu_ci.yml +++ b/.github/workflows/cpu_ci.yml @@ -5,7 +5,7 @@ on: "push" jobs: run-tests: #runs-on: ubuntu-latest - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/cpu_ci_dispatch.yml b/.github/workflows/cpu_ci_dispatch.yml index b1d108b3b..38485d6a6 100644 --- a/.github/workflows/cpu_ci_dispatch.yml +++ b/.github/workflows/cpu_ci_dispatch.yml @@ -10,7 +10,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 53be528ae..7b06256bf 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,6 +1,7 @@ name: Pull Request -on: [pull_request, workflow_dispatch] +#on: [pull_request, workflow_dispatch] +on: workflow_dispatch jobs: pre-commit: @@ -40,7 +41,7 
@@ jobs: git commit -m "Update NeoXArgs docs automatically" git push run-tests: - runs-on: self-hosted + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 diff --git a/tests/README.md b/tests/README.md index 390a52898..f5ba5e560 100644 --- a/tests/README.md +++ b/tests/README.md @@ -57,6 +57,8 @@ Tests can be run against physical CPUs through GitHub Actions. To have tests run ### runs-on +#### NOTE: These BKMs were written to work with CI infrastructure that is no longer in place. To use the Github runners (ubuntu-22.04 / ubuntu-latest), skip the 'runs-on' section. + The CI needs to be written to target the CPU Github Action runner. The jobs that need to run on CPU should use the hardware runner's labels: ```yaml jobs: From 8b43196fbd832b797be9f3d88d54481171010507 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 23 Aug 2024 14:02:59 -0400 Subject: [PATCH 19/27] Megatron-LM style Sequence Parallel (#1257) * first draft (shape errors occurring) * training works (but poor convergence) * debugging progress: current commit works if we do regular TP via impl-ing AR in rowparallel as RS then AG * Update NeoXArgs docs automatically * push most recent code (updated mark_norms fn, back to 'real' sequence parallel) * Update NeoXArgs docs automatically * Fix LayerNorm all reduce gradient hook * Sum instead of average for LayerNorm gradient all reduce * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically * Fix gather and reduce scatter ops on sequence dimension * Fix sequence parallel with tied weight embeddings * Update NeoXArgs docs automatically * cleanup pass + add MoE arguments.py guard * pre-commit and clean up comments * remove vestigial debug code * remove unused debugging code * remove dummy test config * update fp32_allreduce to handle fp16 ; don't cast to fp32 for gathers * run linter on the rest of the files * Improve performance of sequence parallel gather, scatter, and reduce * Add comment * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Brandon Yang Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 12 +- megatron/model/__init__.py | 5 +- megatron/model/gpt2_model.py | 5 +- megatron/model/transformer.py | 29 ++++- megatron/model/utils.py | 56 ++++++-- megatron/model/word_embeddings.py | 10 ++ megatron/mpu/__init__.py | 3 + megatron/mpu/layers.py | 39 +++++- megatron/mpu/mappings.py | 187 +++++++++++++++++++++++++-- megatron/mpu/utils.py | 22 ++++ megatron/neox_arguments/arguments.py | 4 + megatron/neox_arguments/neox_args.py | 7 + megatron/training.py | 3 + 13 files changed, 349 insertions(+), 33 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 1e67685ed..413138597 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 455446c + Default = 53d0ae8 current git hash of repository @@ -1056,6 +1056,16 @@ Parallelism Arguments +- **sequence_parallel**: bool + + Default = False + + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. 
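    As a rough standalone sketch of what this means for activations (illustration only, not the code added by this patch; the real implementation is in megatron/mpu/mappings.py below): tensors of shape [s, b, h] are split along the sequence dimension across the model-parallel group, and gathered back, with a reduce-scatter in the backward pass, wherever a layer needs the full sequence.

```
# Toy sketch with 2 "ranks"; the real comms use torch.distributed collectives.
import torch

world_size = 2
x = torch.randn(8, 4, 16)                   # [seq, batch, hidden]

# scatter: each rank keeps only its slice of the sequence dimension
shards = torch.chunk(x, world_size, dim=0)  # two [4, 4, 16] shards

# gather: reassemble the full sequence where a layer needs it
full = torch.cat(shards, dim=0)
assert torch.equal(full, x)
```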
+ **Set by user, in contrast to neox_args.is_pipe_parallel.** + + + - **expert_interval**: int Default = 2 diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 619b4c33d..23be28936 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -16,5 +16,8 @@ # limitations under the License. from .gpt2_model import GPT2ModelPipe -from .utils import get_params_for_weight_decay_optimization +from .utils import ( + get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, +) from .word_embeddings import SoftEmbedding diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 9e643874a..7899048db 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -308,7 +308,10 @@ def _logits_helper(embedding, lm_output): ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, + embedding.word_embeddings_weight, + self.parallel_output, + seq_parallel=self.neox_args.sequence_parallel, ) return logits diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index c154b09f4..62e7d3a9c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -254,6 +254,7 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here + seq_dim=1, # important: must mark that this layer receives shape [b, s, h] not [s, b, h] and so Seq. Parallel comms must gather along dim=1 rather than dim=0 ) # else: @@ -1024,7 +1025,14 @@ def __init__( self.moe_type = neox_args.moe_type if self.gpt_j_residual: - self.reduce = mpu.mappings.reduce_from_model_parallel_region + # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers. + # the reduction we use is a simple allreduce for pure Tensor Parallel, + # but needs to be a reduce-scatter when using Megatron-style Sequence Parallel (LN sharding.) + self.reduce = ( + mpu.mappings.reduce_from_model_parallel_region + if not neox_args.sequence_parallel + else mpu.mappings.reduce_scatter_to_sequence_parallel_region + ) # Self attention. self.attention = ParallelSelfAttention( @@ -1339,10 +1347,25 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits( + input_, + word_embeddings_weight, + parallel_output, + seq_parallel=False, + seq_dim=1, + bias=None, +): """LM logits using word embedding weights.""" # Parallel logits. - input_parallel = mpu.copy_to_model_parallel_region(input_) + if seq_parallel: + # if using Sequence Parallelism, our logits are sharded along the sequence dimension. + # gather them here. (backward pass: reduce-scatter) + input_parallel = mpu.gather_from_sequence_parallel_region( + input_, seq_dim=seq_dim + ) + else: + # Set up backprop all-reduce. + input_parallel = mpu.copy_to_model_parallel_region(input_) # Matrix multiply. 
if bias is None: diff --git a/megatron/model/utils.py b/megatron/model/utils.py index c3da2ce8b..97b409c1d 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,8 +18,8 @@ """Utilities for models.""" import torch -from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes +from megatron import mpu from types import GeneratorType import torch.distributed as dist @@ -35,15 +35,9 @@ def get_params_for_weight_decay_optimization(module, neox_args): "name": "no_weight_decay_params", } for module_ in module.modules(): - if any( - [ - isinstance(module_, LayerNorm), - isinstance(module_, RMSNorm), - isinstance(module_, ScaleNorm), - ] - ) or ( - neox_args.weight_decay == 0.0 - ): # also include all parameters here if no weight decay is being done + # apply weight decay to any "...Norm" modules. + if "norm" in type(module_).__name__.lower() or neox_args.weight_decay == 0.0: + # also include all parameters here if no weight decay is being done no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) @@ -359,3 +353,45 @@ def get_fusion_type(neox_args): elif neox_args.scaled_masked_softmax_fusion: fusion_type = SoftmaxFusionTypes.general return fusion_type + + +def reduce_weight_grads_from_model_parallel_region(input_): + """A hook that can be applied to any weight tensor via .register_hook(). + Allreduces grads for e.g. LN weights across the model parallel group. + Needed to keep LNs in sync, despite them getting diff data -> diff gradients when using sequence parallel. + """ + # Bypass the function if no TP -> no comm needed. + if mpu.get_model_parallel_world_size() == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.float() + + # All-reduce. + torch.distributed.all_reduce(input_, group=mpu.get_model_parallel_group()) + + # Bf16 convert + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.bfloat16() + + return input_ + + +def mark_norms_for_sequence_parallel_grad_sync(module, neox_args): + """Iterate through the modules in our model, and for any "...Norm" classnames, + register a hook on each of that module's parameters which will allreduce norms' weights' grads across + the model (sequence) parallel region. 
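    A tiny self-contained check of why this all-reduce is a sum rather than a mean (illustration only; the toy loss and shapes are made up): under sequence parallelism each rank computes the norm weight's gradient from only its shard of tokens, and the full-batch gradient is recovered by summing the per-shard gradients.

```
# Toy check: the sum of per-shard gradients equals the full-batch gradient.
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)                        # 4 tokens, hidden size 8

w_full = torch.ones(8, requires_grad=True)
(x * w_full).sum().backward()                # toy loss over all tokens

shard_grads = []
for shard in torch.chunk(x, 2, dim=0):       # two "sequence-parallel" shards
    w = torch.ones(8, requires_grad=True)
    (shard * w).sum().backward()
    shard_grads.append(w.grad)

assert torch.allclose(w_full.grad, shard_grads[0] + shard_grads[1])
```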
+ """ + + if not neox_args.sequence_parallel: + # if we aren't using sequence parallelism, this is a no-op + return + + for module_ in module.modules(): + if "norm" in type(module_).__name__.lower(): + # this is a norm, we want to allreduce its weight grads across sequence parallel region + for name, param in module_.named_parameters(): + if param.requires_grad: + param.register_hook(reduce_weight_grads_from_model_parallel_region) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index f7372bc55..ce3c1117e 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,6 +50,11 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes + + self.sequence_parallel = ( + neox_args.sequence_parallel + ) # if we are using sequence parallelism, then we'll want to scatter our inputs across the seqlen dim across TP ranks + self.use_mup = neox_args.use_mup self.mup_embedding_mult = neox_args.mup_embedding_mult self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult @@ -159,6 +164,11 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): with torch.no_grad(): embeddings.mul_(self.mup_embedding_mult) + if self.sequence_parallel: + # TODO: megatron-lm does dropout using the scattered embs. This would save a tiny bit of time, perhaps? + # Not a priority since we don't often use dropout + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + return embeddings diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index 2365507d9..780fb33e8 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -47,6 +47,9 @@ from .mappings import gather_from_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region +from .mappings import scatter_to_sequence_parallel_region from .random import checkpoint from .random import get_cuda_rng_tracker diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 0d14806ac..d59edab94 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -33,6 +33,8 @@ from .mappings import gather_from_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region from .random import get_cuda_rng_tracker from .utils import divide from .utils import VocabUtility @@ -416,6 +418,7 @@ def __init__( MOE=False, MoE_mp_size=1, mup_rescale_parameters=False, + seq_dim=0, # Dimension which is the seq_len dimension. final ParallelLinear overrides this to be 1 ; otherwise, the default is used throughout. 
): super(ColumnParallelLinear, self).__init__() @@ -427,6 +430,10 @@ def __init__( world_size = MoE_mp_size if MOE else get_model_parallel_world_size() self.output_size_per_partition = divide(output_size, world_size) self.skip_bias_add = skip_bias_add + + self.sequence_parallel = neox_args.sequence_parallel + self.seq_dim = seq_dim + self.init_method = init_method self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters @@ -551,14 +558,29 @@ def set_parallel_output(self, value: bool): def forward(self, input_): if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() - # Set up backprop all-reduce. - input_parallel = copy_to_model_parallel_region(input_) + + if self.sequence_parallel: + input_parallel = input_ + else: + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. + if self.sequence_parallel: + # do an AG in the fwd pass, RS in bwd pass. + # gather / scatter portion happens across the sequence dim (self.seq_dim)-- + # almost always is [s, b, h] and so dim 0, but for lm_head ParallelLinear it is seq_dim=1 and [b, s, h] + input_parallel = gather_from_sequence_parallel_region( + input_parallel, seq_dim=self.seq_dim + ) + bias = self.bias if not self.skip_bias_add else None output_parallel = F.linear(input_parallel, self.weight, bias) if self.gather_output: # All-gather across the partitions. + assert ( + not self.sequence_parallel + ), "sequence_parallel=True and gather_output=True are incompatible!" output = gather_from_model_parallel_region(output_parallel) else: output = output_parallel @@ -623,6 +645,12 @@ def __init__( self.input_size_per_partition = divide(input_size, world_size) self.skip_bias_add = skip_bias_add self.parallel_output = parallel_output + + self.sequence_parallel = neox_args.sequence_parallel + assert not ( + self.sequence_parallel and not self.input_is_parallel + ), "Cannot have self.input_is_parallel=False and self.sequence_parallel=True." + self.init_method = init_method self.stride = stride self.keep_master_weight_for_test = keep_master_weight_for_test @@ -748,7 +776,12 @@ def forward(self, input_): # Matrix multiply. output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. - if not self.parallel_output: + if self.sequence_parallel and not self.parallel_output: + # do an RS in the fwd pass, AG in bwd pass. + # skip in the gpt-j parallel sublayer case (self.parallel_output=True) + # (user responsible for calling reduce-scatter) + output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + elif not self.parallel_output: output_ = reduce_from_model_parallel_region(output_parallel) else: output_ = output_parallel diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py index 535fe6255..f11d9e6ab 100644 --- a/megatron/mpu/mappings.py +++ b/megatron/mpu/mappings.py @@ -23,7 +23,7 @@ get_model_parallel_rank, get_fp32_allreduce, ) -from .utils import split_tensor_along_last_dim +from .utils import split_tensor_along_last_dim, split_tensor_along_any_dim def _reduce(input_): @@ -33,17 +33,17 @@ def _reduce(input_): if get_model_parallel_world_size() == 1: return input_ - # Bf16 convert + # upcast to fp32 if using fp32 allreduce dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): + if get_fp32_allreduce(): input_ = input_.float() # All-reduce. 
torch.distributed.all_reduce(input_, group=get_model_parallel_group()) - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.bfloat16() + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + input_ = input_.to(dt) return input_ @@ -75,11 +75,6 @@ def _gather(input_): if world_size == 1: return input_ - # Bf16 convert - dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.float() - # Size and dimension. last_dim = input_.dim() - 1 rank = get_model_parallel_rank() @@ -91,9 +86,100 @@ def _gather(input_): # Note: torch.cat already creates a contiguous tensor. output = torch.cat(tensor_list, dim=last_dim).contiguous() - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - output = output.bfloat16() + return output + + +def _reduce_scatter_along_seq_dim(input_, seq_dim): + """Reduce-scatter the input tensor across model parallel group, scattering across sequence dim.""" + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # upcast to fp32 if using fp32 allreduce + dt = input_.dtype + if get_fp32_allreduce(): + input_ = input_.float() + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + assert dim_size[seq_dim] % world_size == 0 + + if seq_dim == 0: + # reduce_scatter_tensor is faster but only works correctly on dimension 0 + dim_size[seq_dim] = dim_size[seq_dim] // world_size + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + tensor_list = list( + torch.split(input_, input_.shape[seq_dim] // world_size, seq_dim) + ) + output = torch.empty_like(tensor_list[0]) + torch.distributed.reduce_scatter(output, tensor_list) + + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + output = output.to(dt) + + return output + + +def _gather_along_seq_dim(input_, seq_dim): + """Gather tensors and concatinate along the (manually-specified) sequence dimension.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + dim_size[seq_dim] = dim_size[seq_dim] * world_size + + if seq_dim == 0: + # reduce_gather_tensor is faster but only works correctly on dimension 0 + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + input_ = input_.contiguous() + rank = get_model_parallel_rank() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather( + tensor_list, input_, group=get_model_parallel_group() + ) + output = torch.cat(tensor_list, dim=seq_dim) + + return output + + +def _split_along_seq_dim(input_, seq_dim): + """Split the tensor along the sequence dimension (as manually selected) and keep the + corresponding slice.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + # Split along second dimension. + input_list = split_tensor_along_any_dim(input_, world_size, seq_dim) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_model_parallel_rank() + output = input_list[rank].contiguous() return output @@ -162,6 +248,65 @@ def backward(ctx, grad_output): return _split(grad_output) +class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): + """Reduce-Scatter across sequence parallel region (same as model parallel region.) + Note: same region as model parallel region + """ + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _gather_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _GatherFromSequenceParallelRegion(torch.autograd.Function): + """All-Gather across sequence parallel region (same region as model parallel region.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _reduce_scatter_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _ScatterToSequenceParallelRegion(torch.autograd.Function): + """Scatter (split) sequence length across sequence parallel region (=> same region as model parallel.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return ( + _gather_along_seq_dim(grad_output, seq_dim=seq_dim), + None, + ) + + # ----------------- # Helper functions. # ----------------- @@ -181,3 +326,17 @@ def scatter_to_model_parallel_region(input_): def gather_from_model_parallel_region(input_): return _GatherFromModelParallelRegion.apply(input_) + + +def reduce_scatter_to_sequence_parallel_region(input_, seq_dim=0): + return _ReduceScatterToSequenceParallelRegion.apply(input_, seq_dim) + + +def gather_from_sequence_parallel_region(input_, seq_dim=0): + return _GatherFromSequenceParallelRegion.apply(input_, seq_dim) + + +def scatter_to_sequence_parallel_region( + input_, seq_dim=1 +): # use this fn in scattering input embeds across TP ranks. There, shape of inps is [b, s, h] instead of the usual [s, b, h] + return _ScatterToSequenceParallelRegion.apply(input_, seq_dim) diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index 13941dc29..1f97e0e76 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -53,6 +53,28 @@ def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks= return tensor_list +def split_tensor_along_any_dim( + tensor, num_partitions, seq_dim, contiguous_split_chunks=False +): + """Split a tensor along a user-specified dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + seq_dim: dimension along which to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. 
+ """ + # Get the size and dimension. + seq_dim_size = divide(tensor.size()[seq_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, seq_dim_size, dim=seq_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + class VocabUtility: """Split the vocabulary into `world_size` chunks amd return the first and last index of the vocabulary belonging to the `rank` diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 054689eda..6a84df6c7 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1038,6 +1038,10 @@ def calculate_derived(self): assert self.zero_optimization["stage"] != 3, "MoE not compatible with zero3" assert self.mlp_type == "regular", "MoE not compatible with LLaMA" + assert ( + self.sequence_parallel is False + ), "MoE not compatible with Sequence Parallel" + # Attention config if self.attention_config is None: self.update_value("attention_config", [[["global"], self.num_layers]]) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index febefb3c2..7993f785f 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -85,6 +85,13 @@ class NeoXArgsParallelism(NeoXArgsTemplate): according to pipeline parallel size. """ + sequence_parallel: bool = False + """ + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. + **Set by user, in contrast to neox_args.is_pipe_parallel.** + """ + expert_interval: int = 2 """ Have one MoE layer every expert_interval layers diff --git a/megatron/training.py b/megatron/training.py index 3265680c5..ce59b242a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -43,6 +43,7 @@ GPT2ModelPipe, SoftEmbedding, get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, ) from megatron.checkpointing import load_checkpoint, save_checkpoint from megatron.data.data_utils import build_train_valid_test_data_iterators @@ -765,6 +766,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): # config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) + mark_norms_for_sequence_parallel_grad_sync(model, neox_args) if neox_args.moe_num_experts > 1 and neox_args.moe_type == "megablocks": # We need to additionally set this flag to ensure DS parallelism properly handles this foreign MoE. model.has_moe_layers = True @@ -891,6 +893,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and neox_args.iteration <= neox_args.profile_step_stop ): torch.cuda.nvtx.range_push(f"Optimizer step") + timers("optimizer").start() if neox_args.deepspeed: model.step() From e7c0182f4b8f0be7c894e292baebabc6d16cecc2 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 23 Aug 2024 21:38:52 -0400 Subject: [PATCH 20/27] Add new cites (#1255) * Update README.md I added new models that have come out trained with the GPT-NeoX library. The library itself is sufficiently well-used that simply listing all citing papers is rapidly becoming non-viable. I'm currently leaning towards providing a curated list of "exciting" papers? I haven't looked at other libraries to see what they do yet. 
* Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e63a59f28..b5fc0d877 100644 --- a/README.md +++ b/README.md @@ -736,7 +736,7 @@ The following publications by other research groups use this library: The following models were trained using this library: ### English LLMs -- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) +- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia) - CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B) - StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM) - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) @@ -747,13 +747,15 @@ The following models were trained using this library: ### Non-English LLMs - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) - Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean) -- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) +- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese) - LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean) - Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese) - CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese) - The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese) - The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese) - nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi) +- Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese) +- The Basque Center for Language Technology's [Latixna (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque) ### Code Models - Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm) @@ -761,11 +763,13 @@ The following models were trained using this library: - CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) ### AI for Science +- EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) - Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge) -- Oak Ridge National Lab and EleutherAI's [Unnamed Material Science Domain Models (7B)](https://github.com/at-aaims/forge) +- Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691) - Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr) ### Other Modalities +- Rinna Co.'s [PSLM 
(7B)](https://arxiv.org/abs/2406.12428) (speech / text) - University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1) - Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table) From 591563d3f6a54af2279aad40444c3a04033cf22b Mon Sep 17 00:00:00 2001 From: Jacob Hatef <74274091+jahatef@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:51:56 -0400 Subject: [PATCH 21/27] mamba fixes and cleaning (#1262) * mamba fixes and cleaning * space * revert assertion change for now --------- Co-authored-by: Jacob Hatef --- configs/mamba/mamba-1.4B.yml | 68 ++++++++++++++++++++++++++- configs/mamba/mamba-130M.yml | 69 ++++++++++++++++++++++++++- configs/mamba/mamba-2.8B.yml | 68 ++++++++++++++++++++++++++- configs/mamba/mamba-370M.yml | 69 ++++++++++++++++++++++++++- configs/mamba/mamba-790M.yml | 70 +++++++++++++++++++++++++++- megatron/model/mamba/mamba.py | 5 +- megatron/neox_arguments/arguments.py | 2 +- 7 files changed, 339 insertions(+), 12 deletions(-) diff --git a/configs/mamba/mamba-1.4B.yml b/configs/mamba/mamba-1.4B.yml index 2898a72fd..eae467d0e 100644 --- a/configs/mamba/mamba-1.4B.yml +++ b/configs/mamba/mamba-1.4B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00002, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 1, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index d9a6ab92e..7187048e6 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -19,5 +19,70 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", -} + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, diff --git a/configs/mamba/mamba-2.8B.yml b/configs/mamba/mamba-2.8B.yml index 1aacb264b..d5afef368 100644 --- a/configs/mamba/mamba-2.8B.yml +++ b/configs/mamba/mamba-2.8B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00016, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000016, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-370M.yml b/configs/mamba/mamba-370M.yml index 5e5a78cca..0058f1c0e 100644 --- a/configs/mamba/mamba-370M.yml +++ b/configs/mamba/mamba-370M.yml @@ -12,12 +12,77 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00003, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-790M.yml b/configs/mamba/mamba-790M.yml index fcd324d9d..4aef7e813 100644 --- a/configs/mamba/mamba-790M.yml +++ b/configs/mamba/mamba-790M.yml @@ -12,12 +12,78 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000025, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index d5d6b336f..3177267cb 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -13,9 +13,8 @@ from causal_conv1d import causal_conv1d_fn import einops except ModuleNotFoundError: - print( - "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba" - ) + print( "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba") pass from megatron.model.norms import get_norm diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 6a84df6c7..fb26fb4aa 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1189,7 +1189,7 @@ def validate_values(self): return False # Checks. 
- if self.hidden_size % self.num_attention_heads != 0: + if self.hidden_size % self.num_attention_heads != 0 and not ("mamba" in self.attention_config): error_message = ( self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" From c7863673e3c08b5886cae36cf096a0fb5789dd0e Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:58:21 -0500 Subject: [PATCH 22/27] SFT improvements (labeling fixes, different packing implementations) (#1240) * - add different packing impl (Unpacked, packing until overflow) - fix labels to also have valid/test implementations - fix label masking in _get_batch to also include anything from get_ltor_masks_and_position_ids * Update arguments.py to use train_label_data_paths instead of label_data_paths * - fix precommit --- megatron/data/data_utils.py | 37 +++++- megatron/data/gpt2_dataset.py | 188 ++++++++++++++++++++++----- megatron/neox_arguments/arguments.py | 6 +- megatron/neox_arguments/neox_args.py | 29 ++++- megatron/training.py | 21 +-- 5 files changed, 227 insertions(+), 54 deletions(-) diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index bc5754cdb..7e4dbdb37 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -55,6 +55,8 @@ def build_the_dataset( data_prefix, name, data_impl, + pack_impl, + allow_chopped, num_samples, seq_length, seed, @@ -83,6 +85,8 @@ def build_the_dataset( num_samples, seq_length, seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, build_index_mappings=build_index_mappings, label_dataset=label_dataset, ) @@ -93,6 +97,8 @@ def build_train_valid_test_datasets( data_prefix, use_shared_fs, data_impl, + pack_impl, + allow_chopped, splits_string, train_valid_test_num_samples, seq_length, @@ -138,6 +144,8 @@ def build_dataset(index, name): train_valid_test_num_samples[index], seq_length, seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, use_shared_fs=use_shared_fs, ) return dataset @@ -204,12 +212,25 @@ def build_weighted_datasets( ): # build individual datasets train_datasets, valid_datasets, test_datasets = [], [], [] - for i, (train_path, label_path, valid_path, test_path) in enumerate( + for i, ( + train_path, + train_label_path, + valid_path, + valid_label_path, + test_path, + test_label_path, + ) in enumerate( zip_longest( neox_args.train_data_paths, - neox_args.label_data_paths if neox_args.label_data_paths else [], + neox_args.train_label_data_paths + if neox_args.train_label_data_paths + else [], neox_args.valid_data_paths, + neox_args.valid_label_data_paths + if neox_args.valid_label_data_paths + else [], neox_args.test_data_paths, + neox_args.test_label_data_paths if neox_args.test_label_data_paths else [], ) ): if train_path: @@ -218,12 +239,14 @@ def build_weighted_datasets( data_prefix=train_path, name=f"train_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=train_num_samples[i], seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, - label_prefix=label_path, + label_prefix=train_label_path, ) ) @@ -233,11 +256,14 @@ def build_weighted_datasets( data_prefix=valid_path, name=f"valid_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=valid_num_samples[i], seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not 
neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=valid_label_path, ) ) @@ -247,11 +273,14 @@ def build_weighted_datasets( data_prefix=test_path, name=f"test_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=test_num_samples[i], seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=test_label_path, ) ) return train_datasets, valid_datasets, test_datasets @@ -414,6 +443,8 @@ def build_train_valid_test_data_iterators(neox_args): seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, ) # Build dataloders. diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index 75e601fda..edba57df2 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -36,14 +36,19 @@ def __init__( num_samples, seq_length, seed, + pack_impl="packed", + allow_chopped=True, build_index_mappings=True, use_shared_fs=True, label_dataset=None, ): self.name = name + self.pack_impl = pack_impl + self.allow_chopped = allow_chopped self.indexed_dataset = indexed_dataset self.label_dataset = label_dataset + self.seq_length = seq_length # Checks assert np.min(documents) >= 0 @@ -56,10 +61,13 @@ def __init__( data_prefix, documents, self.indexed_dataset.sizes, + self.label_dataset, num_samples, seq_length, seed, + self.pack_impl, use_shared_fs=use_shared_fs, + allow_chopped=self.allow_chopped, ) self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 self.sample_idx_len = self.sample_idx.shape[0] - 1 @@ -113,8 +121,38 @@ def __getitem__(self, idx): samples.append(np.concatenate(sample_list)) if len(datasets) == 1: + if len(samples[0]) < (self.seq_length + 1): + # Pad with -100s so the masking function can ignore these. + samples[0] = np.pad( + samples[0], + (0, (self.seq_length + 1) - len(samples[0])), + mode="constant", + constant_values=-100, + ) + elif len(samples[0]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[0] = samples[0][: (self.seq_length + 1)] return {"text": np.array(samples[0], dtype=np.int64)} else: + if len(samples[0]) < (self.seq_length + 1): + # Pad with 0s, can use any number since it's masked. + samples[0] = np.pad( + samples[0], + (0, (self.seq_length + 1) - len(samples[0])), + mode="constant", + constant_values=0, + ) + # pad with -100s so we can mask it out + samples[1] = np.pad( + samples[1], + (0, (self.seq_length + 1) - len(samples[1])), + mode="constant", + constant_values=-100, + ) + elif len(samples[0]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[0] = samples[0][: (self.seq_length + 1)] + samples[1] = samples[1][: (self.seq_length + 1)] return { "text": np.array(samples[0], dtype=np.int64), "label": np.array(samples[1], dtype=np.int64), @@ -132,10 +170,13 @@ def _build_index_mappings( data_prefix, documents, sizes, + label_dataset, num_samples, seq_length, seed, + packing_impl, use_shared_fs=True, + allow_chopped=True, ): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. 
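A rough sketch of what the three `pack_impl` modes introduced in this patch mean for a stream of tokenized documents. This is orientation only: the `pack_documents` helper below is not part of the patch, and the real `GPT2Dataset` implements the same ideas through doc/sample/shuffle index mappings (padding or truncating each sample to `seq_length + 1` in `__getitem__`) rather than by concatenating token lists.

```python
# Conceptual sketch of the pack_impl modes added in this patch (illustrative,
# not the repo's implementation, which builds index mappings instead).
def pack_documents(docs, seq_length, pack_impl="packed", allow_chopped=True):
    samples = []
    if pack_impl == "packed":
        # Concatenate everything and cut fixed-length windows, so a sample
        # may begin or end mid-document.
        stream = [tok for doc in docs for tok in doc]
        for i in range(0, max(len(stream) - seq_length, 0), seq_length):
            samples.append(stream[i : i + seq_length + 1])
    elif pack_impl == "pack_until_overflow":
        # Greedily append whole documents until the next one would overflow,
        # then start a new sample; short samples are padded downstream.
        current = []
        for doc in docs:
            if not allow_chopped and len(doc) > seq_length + 1:
                continue  # a too-long document could only be used by chopping it
            if current and len(current) + len(doc) > seq_length + 1:
                samples.append(current)
                current = []
            current = current + list(doc)
        if current:
            samples.append(current)
    else:  # "unpacked": one document per sample
        samples = [
            list(doc) for doc in docs if allow_chopped or len(doc) <= seq_length + 1
        ]
    return samples


docs = [[1] * 10, [2] * 300, [3] * 40]
print([len(s) for s in pack_documents(docs, 128, "pack_until_overflow")])  # [10, 300, 40]
```

In a config these surface as `"pack_impl": "pack_until_overflow"` (or `"unpacked"`) and `"allow_chopped": false`; as the `allow_chopped` docstring later in this patch notes, the flag is ignored by the default `"packed"` mode.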
@@ -155,6 +196,9 @@ def _build_index_mappings( _filename += "_{}ns".format(num_samples) _filename += "_{}sl".format(seq_length) _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + if allow_chopped: + _filename += "_ac" doc_idx_filename = _filename + "_doc_idx.npy" sample_idx_filename = _filename + "_sample_idx.npy" shuffle_idx_filename = _filename + "_shuffle_idx.npy" @@ -177,44 +221,116 @@ def _build_index_mappings( ) # doc-idx. start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save doc-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. - from megatron.data import helpers - - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - - num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length - if 2 * (num_samples + 1) < np.iinfo(np.int32).max: - sample_idx = helpers.build_sample_idx_int32( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + if packing_impl == "packed": + doc_idx = _build_doc_idx(documents, num_epochs, np_rng) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - else: - sample_idx = helpers.build_sample_idx_int64( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + from megatron.data import helpers + + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + + num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length + if 2 * (num_samples + 1) < np.iinfo(np.int32).max: + sample_idx = helpers.build_sample_idx_int32( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + else: + sample_idx = helpers.build_sample_idx_int64( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save sample-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retrieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save shuffle-idx mapping" - " (seconds): {:4f}".format(time.time() - start_time) - ) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retrieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) + elif packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. 
+ shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[temp_shuffle_idx[curr_shuffle_idx]] > seq_length + 1: + curr_shuffle_idx += 1 + continue + # First, check if we need to skip this item... + if label_dataset is not None: + if np.all( + label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = sizes[temp_shuffle_idx[curr_shuffle_idx]] + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # Just in case we have bad data in the loop... 
+ if np.all(label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index fb26fb4aa..327639454 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1121,10 +1121,8 @@ def calculate_derived(self): if self.test_data_paths and (self.test_data_weights is None): self.test_data_weights = [1.0] * len(self.test_data_paths) - if self.label_data_paths: - err_str = ( - "Must use `label_data_paths` with `train_data_paths`, not `data_path`" - ) + if self.train_label_data_paths: + err_str = "Must use `train_label_data_paths` with `train_data_paths`, not `data_path`" assert self.train_data_paths and not self.data_path, err_str # if a sample input file is provided, default text_gen_type type to input-file diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 7993f785f..dd51c7778 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -855,9 +855,9 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to train datasets. """ - label_data_paths: list = None + train_label_data_paths: list = None """ - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train label datasets (not shifted by 1 yet!). """ test_data_paths: list = None @@ -865,11 +865,21 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to test datasets. """ + test_label_data_paths: list = None + """ + List of paths to test label datasets (not shifted by 1 yet!). + """ + valid_data_paths: list = None """ List of paths to validation datasets. """ + valid_label_data_paths: list = None + """ + List of paths to validation label datasets (not shifted by 1 yet!). + """ + train_data_weights: list = None """ List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. @@ -919,6 +929,21 @@ class NeoXArgsTraining(NeoXArgsTemplate): Implementation of indexed datasets, can be one of "infer", "cached", or "mmap" """ + pack_impl: Literal["packed", "pack_until_overflow", "unpacked"] = "packed" + """ + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". + + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + """ + + allow_chopped: bool = True + """ + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. + (e.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + """ + mmap_warmup: bool = False """ Warm up mmap files. diff --git a/megatron/training.py b/megatron/training.py index ce59b242a..fc3d9e129 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -278,16 +278,19 @@ def pretrain(neox_args): def _get_batch(neox_args, tokenizer, keys, data, datatype): """Support function for get_batch / get_batch pipe (to avoid code repetition)""" data_b = mpu.broadcast_data(keys, data, datatype) - + token_key = keys[0] + label_key = keys[1] if len(keys) > 1 else None # Unpack. 
- tokens_ = data_b["text"].long() + tokens_ = data_b[token_key].long() if "label" in data_b: + label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous() labels = torch.where( - data_b["label"].long() >= 0, - data_b["label"].long(), + data_b[label_key].long() >= 0, + data_b[label_key].long(), torch.zeros_like(data_b["label"].long()), )[:, 1:].contiguous() else: + label_mask = (tokens_.long() >= 0)[:, 1:].contiguous() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() @@ -298,9 +301,9 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): eod_mask_loss=neox_args.eod_mask_loss, sliding_window_width=neox_args.sliding_window_width, ) - # If `label` is present, any token < 0 (e.g., -100, the default for torch) skips the loss computation - if "label" in data_b: - loss_mask = (data_b["label"][:, 1:] >= 0).to(loss_mask.dtype) + + # combine loss masks from get_ltor_masks_and_position_ids with loss masks from data + loss_mask = label_mask.to(loss_mask.dtype) * loss_mask return tokens, labels, loss_mask, attention_mask, position_ids @@ -308,7 +311,7 @@ def get_batch(neox_args, data_iterator): """Generate a batch""" # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] datatype = torch.int64 # Broadcast data. @@ -328,7 +331,7 @@ def get_batch(neox_args, data_iterator): def get_batch_pipe(data, neox_args, curr_scheduler=None): """A modification of get_batch() to work with the latest batch instead of an iterator.""" # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] datatype = torch.int64 tokens, labels, loss_mask, attention_mask, position_ids = _get_batch( From 7548a8b76426fbf64e52343ad1846022793de58d Mon Sep 17 00:00:00 2001 From: AI-WAIFU <67525070+AI-WAIFU@users.noreply.github.com> Date: Thu, 5 Sep 2024 21:16:02 +0100 Subject: [PATCH 23/27] add assert for missing tokenizer_type in config (#1267) --- megatron/tokenizer/tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 348c7cefe..e450504c8 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -31,6 +31,8 @@ def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) + + assert args.tokenizer_type is not None, "tokenizer_type must be specified in the .yml config" # Select and instantiate the tokenizer. 
if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): From 0d4bdb965e3bd9eed6df8f5a4f024f9ff310601c Mon Sep 17 00:00:00 2001 From: dtamayo <119006120+dtamayo-nlp@users.noreply.github.com> Date: Sat, 7 Sep 2024 06:17:14 +0200 Subject: [PATCH 24/27] Add `intermediate_size` to GPT-NeoX models (#1212) * Update transformer.py -> Add `intermediate_size` * add support for rwkv and mamba and add todos about swiglu * refactor activations and mlps * change llama config to swiglu * fixes gelu fusion * pre-commit run * add assert message to mamba linear * Update 1-3B.yml revert accidental change * Update 1-3B.yml * fixes various issues * add back swiglu check --------- Co-authored-by: jahatef Co-authored-by: Quentin Anthony Co-authored-by: Jacob Hatef <74274091+jahatef@users.noreply.github.com> --- configs/llama/13B.yml | 2 +- configs/llama/30B.yml | 2 +- configs/llama/65B.yml | 2 +- configs/llama/7B.yml | 2 +- megatron/data/helpers.cpp | 12 +- megatron/model/activations.py | 38 +++--- megatron/model/gmlp.py | 2 +- megatron/model/mamba/mamba.py | 9 +- megatron/model/rwkv/v6/rwkv.py | 17 ++- megatron/model/transformer.py | 167 +++++++++------------------ megatron/neox_arguments/neox_args.py | 23 +++- 11 files changed, 117 insertions(+), 159 deletions(-) diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml index 305567be1..7a823a43c 100644 --- a/configs/llama/13B.yml +++ b/configs/llama/13B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml index 450f8da38..2c356cea2 100644 --- a/configs/llama/30B.yml +++ b/configs/llama/30B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml index 85f199ce2..cc22d3734 100644 --- a/configs/llama/65B.yml +++ b/configs/llama/65B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml index ecbf187a8..0b134ae27 100644 --- a/configs/llama/7B.yml +++ b/configs/llama/7B.yml @@ -22,5 +22,5 @@ "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", } diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index aca290854..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... 
- } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 7a29b0716..c0b825261 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -25,9 +25,23 @@ def get_activation(neox_args): - """retrieves the activation function specified in neox_args""" + """retrieves the activation function specified in neox_args and whether or not the activation is gated""" + is_gated = False if neox_args.activation == "geglu": - activation_func = GEGLU(neox_args=neox_args) + is_gated = True + activation_func = F.gelu + elif neox_args.activation == "reglu": + is_gated = True + activation_func = F.relu + elif neox_args.activation == "bilinear": + is_gated = True + activation_func = lambda x: x + elif neox_args.activation == "swiglu": + is_gated = True + activation_func = swish + elif neox_args.activation == "glu": + is_gated = True + activation_func = F.sigmoid elif neox_args.activation == "gelu": if neox_args.onnx_safe and neox_args.bias_gelu_fusion: raise ValueError("onnx_safe + bias_gelu_fusion not compatible") @@ -49,7 +63,7 @@ def get_activation(neox_args): activation_func = F.silu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") - return activation_func + return activation_func, is_gated ###### BIAS GELU FUSION/ NO AUTOGRAD ################ @@ -119,21 +133,3 @@ def swish(x, beta: float = 1.0): @torch.jit.script def mish(x): return x * torch.tanh(F.softplus(x)) - - -class GEGLU(torch.nn.Module): - def __init__(self, neox_args): - super(GEGLU, self).__init__() - if neox_args.onnx_safe: - self.activation_func = erf_gelu - else: - self.activation_func = F.gelu - - def forward(self, x, bias=None): - x, gate = x.chunk(2, dim=-1) - if bias is not None: - bias_1, bias_2 = bias.chunk(2, dim=-1) - x = x + bias_1 - gate = gate + bias_2 - intermediate_parallel = self.activation_func(gate) - return intermediate_parallel * x diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py index c3462c651..6400640bd 100644 --- a/megatron/model/gmlp.py +++ b/megatron/model/gmlp.py @@ -112,7 +112,7 @@ def __init__( init_method=init_method, skip_bias_add=True, ) - self.activation_func = get_activation(neox_args) + self.activation_func, _ = get_activation(neox_args) ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) if neox_args.attention_config[layer_number] == "amlp": d_attn = neox_args.gmlp_attn_dim diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index 3177267cb..b3d9e1549 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -44,12 +44,17 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" 
+ assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size self.d_state = 16 # state dimensions per channel self.d_conv = 4 # convolution width - self.expand = 2 # linear projection expansion factors - self.d_inner = int(self.expand * self.d_model) + if neox_args.intermediate_size: + self.d_inner = neox_args.intermediate_size + else: + self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2 + self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index 5d4e0d144..ec8cc1aa6 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -247,11 +247,11 @@ def __init__(self, neox_args, layer_number): self.time_maa_k = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) self.time_maa_r = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) - self.key = nn.Linear(neox_args.hidden_size, neox_args.dim_ffn, bias=False) + self.key = nn.Linear(neox_args.hidden_size, neox_args.ffn_dim, bias=False) self.receptance = nn.Linear( neox_args.hidden_size, neox_args.hidden_size, bias=False ) - self.value = nn.Linear(neox_args.dim_ffn, neox_args.hidden_size, bias=False) + self.value = nn.Linear(neox_args.ffn_dim, neox_args.hidden_size, bias=False) def forward(self, x): xx = self.time_shift(x) - x @@ -275,14 +275,19 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" + assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size - if not hasattr(neox_args, "dim_ffn"): - # Make hidden size 3.5x. Round to nearest multiple of 32 until we add hdim rounding logic - neox_args.dim_ffn = int((neox_args.hidden_size * 3.5) // 32 * 32) + if neox_args.intermediate_size: + neox_args.ffn_dim = neox_args.intermediate_size + else: + self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) + # Make hidden size 3.5x by default. 
Round to nearest multiple of 32 until we add hdim rounding logic + neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) assert neox_args.hidden_size % 32 == 0 assert neox_args.dim_att % 32 == 0 - assert neox_args.dim_ffn % 32 == 0 + assert neox_args.ffn_dim % 32 == 0 self.neox_args.head_size = neox_args.dim_att // neox_args.num_attention_heads self.head_size = self.neox_args.head_size self.num_attention_heads = neox_args.num_attention_heads diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 62e7d3a9c..119676c54 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -93,37 +93,55 @@ def __init__( init_method, output_layer_init_method, parallel_output=False, + multiple_of=256, MOE=False, MoE_mp_size=1, ): super().__init__() + assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" - self.activation_func = get_activation(neox_args) + self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation self.bias_gelu_fusion = neox_args.bias_gelu_fusion + self.multiple_of = multiple_of - # auto scale so geglu has equal parameters - ff_mult = int(4 * 2 / 3) if self.activation_type == "geglu" else 4 - ff_dim = ( - int(ff_mult * neox_args.hidden_size) * 2 - if self.activation_type == "geglu" - else ff_mult * neox_args.hidden_size + if neox_args.intermediate_size: + ffn_dim = neox_args.intermediate_size + elif neox_args.expansion_factor: + ffn_dim = int(neox_args.expansion_factor * neox_args.hidden_size) + else: + # 4h is default for ffn_dim + ffn_dim = 4 * neox_args.hidden_size + ffn_dim_in = ffn_dim + if self.is_gated: + # set activation function to be gated implementation + self.activation_func = Gated_Activation(self.activation_func) + # auto scale so gated activations has equal parameters + ffn_dim = int(ffn_dim * 2 / 3) + ffn_dim_in = ffn_dim // 2 + # set multiple + ffn_dim = int( + (2 * self.multiple_of) + * ((ffn_dim + (2 * multiple_of) - 1) // (2 * multiple_of)) + ) + ffn_dim_in = int( + self.multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of) ) - self.dense_h_to_4h = mpu.ColumnParallelLinear( + + self.linear1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, - output_size=ff_dim, + output_size=ffn_dim, gather_output=False, init_method=init_method, skip_bias_add=True, MOE=MOE, MoE_mp_size=MoE_mp_size, ) - ff_dim_in = ff_dim // 2 if self.activation_type == "geglu" else ff_dim # Project back to h. 
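A worked example of the gated-MLP sizing arithmetic above. The hidden size is an assumed value (not taken from any shipped config), and `multiple_of` keeps the default of 256 from the constructor signature:

```python
# Worked example of the ffn_dim computation above for a gated activation
# such as swiglu; hidden_size is an assumed example value.
hidden_size, multiple_of = 4096, 256

ffn_dim = 4 * hidden_size       # default 4h                        -> 16384
ffn_dim = int(ffn_dim * 2 / 3)  # gated: keep parameter count close -> 10922
ffn_dim_in = ffn_dim // 2       # width after the gate is applied   -> 5461

# Round both widths up to friendly multiples (clean model-parallel splits
# and efficient GEMM shapes).
ffn_dim = (2 * multiple_of) * ((ffn_dim + 2 * multiple_of - 1) // (2 * multiple_of))
ffn_dim_in = multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of)

print(ffn_dim, ffn_dim_in)  # 11264 5632
```

So `linear1` emits 11264 features, `Gated_Activation` chunks them into a 5632-wide value and a 5632-wide gate, and `linear2` (the down projection below) maps the 5632-wide product back to `hidden_size`. If `intermediate_size` or `expansion_factor` is set, the starting point is that value rather than 4h.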
- self.dense_4h_to_h = mpu.RowParallelLinear( + self.linear2 = mpu.RowParallelLinear( neox_args=neox_args, - input_size=ff_dim_in, + input_size=ffn_dim_in, output_size=neox_args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, @@ -134,13 +152,10 @@ def __init__( ) def forward(self, hidden_states): + # [s, b, intermediate_size] + intermediate_parallel, bias_parallel = self.linear1(hidden_states) - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - - if ( - self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + if self.is_gated or (self.activation_type == "gelu" and self.bias_gelu_fusion): intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -150,84 +165,23 @@ def forward(self, hidden_states): ) # [s, b, h] - output, output_bias = self.dense_4h_to_h(intermediate_parallel) + output, output_bias = self.linear2(intermediate_parallel) return output, output_bias -class LLaMAParallelMLP(nn.Module): - """LLaMA's MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. At the end, dropout is also - applied. - - Note: multiple_of is used to compute the hidden dimension of the MLP - """ - - def __init__( - self, - neox_args, - init_method, - output_layer_init_method, - parallel_output=False, - multiple_of=256, - MOE=False, - MoE_mp_size=1, - ): +class Gated_Activation(torch.nn.Module): + def __init__(self, activation_func): super().__init__() + self.activation_func = activation_func - self.activation_func = get_activation(neox_args) - self.activation_type = neox_args.activation - - self.multiple_of = multiple_of - - # Allow custom intermediate size, e.g. 
for Mistral - if neox_args.intermediate_size is not None: - ff_dim = neox_args.intermediate_size - else: - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.w1 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - self.w3 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - self.w2 = mpu.RowParallelLinear( - neox_args=neox_args, - input_size=ff_dim, - output_size=neox_args.hidden_size, - input_is_parallel=True, - init_method=output_layer_init_method, - skip_bias_add=True, - parallel_output=parallel_output, - bias=False, - MOE=MOE, - MoE_mp_size=MoE_mp_size, - ) - - def forward(self, hidden_states): - w1_out, _ = self.w1(hidden_states) - w3_out, _ = self.w3(hidden_states) - return self.w2(self.activation_func(w1_out) * w3_out) + def forward(self, x, bias=None): + x, gate = x.chunk(2, dim=-1) + if bias is not None: + bias_1, bias_2 = bias.chunk(2, dim=-1) + x = x + bias_1 + gate = gate + bias_2 + intermediate_parallel = self.activation_func(gate) + return intermediate_parallel * x class ParallelLinear(nn.Module): @@ -1054,24 +1008,13 @@ def __init__( # MLP def get_mlp(mlp_type, **kw): - if mlp_type == "regular": - return ParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - elif mlp_type == "llama": - return LLaMAParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - else: - raise KeyError(mlp_type) + return ParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + **kw, + ) self.num_experts = ( neox_args.moe_num_experts @@ -1287,11 +1230,7 @@ def forward(self, x, attention_mask, layer_past=None): raise KeyError(self.moe_type) with torch.enable_grad(): - if ( - self.mlp_type == "llama" - or self.num_experts > 1 - and self.moe_type == "deepspeed" - ): + if self.activation == "swiglu" or self.num_experts > 1 and self.moe_type == "deepspeed": # No dropout either assert mlp_bias is None output = mlp_output + attention_output diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index dd51c7778..818c86d31 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -121,9 +121,12 @@ class NeoXArgsModel(NeoXArgsTemplate): intermediate_size: int = None """ - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h + """ - If not passed, will be set to a reasonable default. + expansion_factor: float = None + """ + Transformer intermediate size. 
Default = 4 """ num_attention_heads: int = None @@ -278,10 +281,20 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", + "geglu", + "relu", + "softsign", + "swish", + "mish", + "silu", + "reglu", + "swiglu", + "bilinear", + "glu", ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] """ scaled_upper_triang_masked_softmax_fusion: bool = False @@ -421,9 +434,9 @@ class NeoXArgsModel(NeoXArgsTemplate): mlp_type: str = "regular" """ + Currently, the only mlp_type is "regular." This behavior is currently deprecated. Types: regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) """ soft_prompt_tuning: dict = None From ec82c05780d40404c618d4905ad14b670a91bd3c Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Sun, 8 Sep 2024 14:14:33 -0700 Subject: [PATCH 25/27] apply pre-commit and add missing close-paren to mamba config (#1270) --- configs/mamba/mamba-130M.yml | 1 + megatron/data/helpers.cpp | 12 ++++++------ megatron/model/mamba/mamba.py | 14 ++++++++++---- megatron/model/rwkv/v6/rwkv.py | 8 ++++++-- megatron/model/transformer.py | 10 ++++++++-- megatron/neox_arguments/arguments.py | 4 +++- megatron/tokenizer/tokenizer.py | 6 ++++-- 7 files changed, 38 insertions(+), 17 deletions(-) diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index 7187048e6..bd05723b2 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -86,3 +86,4 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, +} diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index b3d9e1549..950e36fed 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -13,8 +13,10 @@ from causal_conv1d import causal_conv1d_fn import einops except ModuleNotFoundError: - print( "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ - or directly from https://github.com/state-spaces/mamba") + print( + "Unable to import Mamba kernels. 
Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba" + ) pass from megatron.model.norms import get_norm @@ -44,7 +46,9 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size @@ -53,7 +57,9 @@ def __init__( if neox_args.intermediate_size: self.d_inner = neox_args.intermediate_size else: - self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2 + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 2 + ) self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index ec8cc1aa6..b3741a3fc 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -275,13 +275,17 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size if neox_args.intermediate_size: neox_args.ffn_dim = neox_args.intermediate_size else: - self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + ) neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) # Make hidden size 3.5x by default. 
Round to nearest multiple of 32 until we add hdim rounding logic neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 119676c54..d2b93eb06 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -98,7 +98,9 @@ def __init__( MoE_mp_size=1, ): super().__init__() - assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation @@ -1230,7 +1232,11 @@ def forward(self, x, attention_mask, layer_past=None): raise KeyError(self.moe_type) with torch.enable_grad(): - if self.activation == "swiglu" or self.num_experts > 1 and self.moe_type == "deepspeed": + if ( + self.activation == "swiglu" + or self.num_experts > 1 + and self.moe_type == "deepspeed" + ): # No dropout either assert mlp_bias is None output = mlp_output + attention_output diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 327639454..8fbe045bb 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1187,7 +1187,9 @@ def validate_values(self): return False # Checks. - if self.hidden_size % self.num_attention_heads != 0 and not ("mamba" in self.attention_config): + if self.hidden_size % self.num_attention_heads != 0 and not ( + "mamba" in self.attention_config + ): error_message = ( self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index e450504c8..d39e18243 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -31,8 +31,10 @@ def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) - - assert args.tokenizer_type is not None, "tokenizer_type must be specified in the .yml config" + + assert ( + args.tokenizer_type is not None + ), "tokenizer_type must be specified in the .yml config" # Select and instantiate the tokenizer. if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): From 77e8158ac845c44ba3d3d63e15083b766a41afe0 Mon Sep 17 00:00:00 2001 From: dmahan93 <44207705+dmahan93@users.noreply.github.com> Date: Sun, 8 Sep 2024 18:58:11 -0500 Subject: [PATCH 26/27] Add DPO training (#1242) * Add a chat data preprocessing script * add EOT at end of a chat * - add different packing impl (Unpacked, packing until overflow) - fix labels to also have valid/test implementations - fix label masking in _get_batch to also include anything from get_ltor_masks_and_position_ids * update README.md * - Add metrics to forward step to add DPO specific metrics that are useful (accuracy, etc) - Add reference model setup for DPO - Add pairwise dataset for positive/negative pairs - Add DPO loss * Update arguments.py to use train_label_data_paths instead of label_data_paths * - Bugfixes from upstreaming.... * - add precompute logprobs... * - Finishing up precompute logprobs... * - update readme for DPO... 
* fix varname * Fix pipeline parallelism and incorrect neox_args name * apply precommit --------- Co-authored-by: Quentin Anthony --- configs/README.md | 27 ++ generate.py | 3 + megatron/data/data_utils.py | 178 +++++-- megatron/data/pairwise_dataset.py | 457 ++++++++++++++++++ megatron/neox_arguments/arguments.py | 6 + megatron/neox_arguments/neox_args.py | 63 ++- megatron/text_generation_utils.py | 192 +++++++- megatron/training.py | 313 +++++++++--- megatron/utils.py | 2 +- .../preprocess_data_with_chat_template.py | 1 + 10 files changed, 1145 insertions(+), 97 deletions(-) create mode 100644 megatron/data/pairwise_dataset.py diff --git a/configs/README.md b/configs/README.md index e14274b56..3102a34d1 100644 --- a/configs/README.md +++ b/configs/README.md @@ -235,6 +235,33 @@ Additional DeepSpeed settings besides those mentioned above should be wrapped in "eval_iters": 10, ``` +However, if you want to use DPO style training you'll need to set pos/neg data paths instead of a single one, e.g. + +```yaml + "dataset_impl": "pairwise", + "train_impl": "dpo", + "pack_impl": "unpacked", + "dpo_beta": 0.1, + "dpo_fp32": true, + "pos_train_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_valid_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_test_data_path": "data/enwik8/enwik8_text_pos_document", + "neg_train_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_valid_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_test_data_path": "data/enwik8/enwik8_text_neg_document", + ## If you have labels... (likely to mask out user turns) + "pos_train_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_valid_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_test_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "neg_train_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_valid_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_test_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + ## If you want to precompute the logits over your dataset... 
+ "precompute_model_name": "gpt2", + ## Needed for the generation.py step, if precomputing + "text_gen_type": "precompute" +``` + ### LR Scheduler settings ```yaml diff --git a/generate.py b/generate.py index 743e350d0..e19ef2e0e 100755 --- a/generate.py +++ b/generate.py @@ -23,6 +23,7 @@ generate_samples_from_prompt, generate_samples_unconditional, generate_samples_interactive, + precompute_logits, ) @@ -83,6 +84,8 @@ def main(input_args=None, overwrite_values=None): top_p=neox_args.top_p, ) + elif neox_args.text_gen_type == "precompute": + precompute_logits(neox_args=neox_args, model=model) else: raise ValueError( f"`text_gen_type` either not specified or not recognised: {neox_args.text_gen_type}" diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index 7e4dbdb37..7c13131ad 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -23,6 +23,7 @@ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset from megatron.data.blendable_dataset import BlendableDataset from megatron.data.gpt2_dataset import GPT2Dataset +from megatron.data.pairwise_dataset import PairwiseDataset from megatron.data.samplers import DistributedBatchSampler @@ -53,9 +54,12 @@ def make_data_loader(dataset, neox_args): def build_the_dataset( data_prefix, + pos_data_prefix, + neg_data_prefix, name, data_impl, pack_impl, + dataset_impl, allow_chopped, num_samples, seq_length, @@ -63,33 +67,100 @@ def build_the_dataset( skip_warmup, build_index_mappings=True, label_prefix=None, + pos_label_prefix=None, + neg_label_prefix=None, + precompute_model_name=None, ): """Build train/valid/test datasets.""" - - indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) - if label_prefix is None: - label_dataset = None + if dataset_impl == "gpt2": + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + if label_prefix is None: + label_dataset = None + else: + label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + if precompute_model_name is not None: + # If we have the name, assume it exists. If it doesn't, it will just be None which is fine. 
+ precompute_indexed_dataset = make_indexed_dataset( + data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + precompute_indexed_dataset = precompute_indexed_dataset + elif dataset_impl == "pairwise": + pos_indexed_dataset = make_indexed_dataset( + pos_data_prefix, data_impl, skip_warmup + ) + neg_indexed_dataset = make_indexed_dataset( + neg_data_prefix, data_impl, skip_warmup + ) + if pos_label_prefix is None: + pos_label_dataset = None + # Also do neg here since they both must be the same + assert neg_label_prefix is None + neg_label_dataset = None + else: + pos_label_dataset = make_indexed_dataset( + pos_label_prefix, data_impl, skip_warmup + ) + # Also do neg here since they both must be the same + assert neg_label_prefix is not None + neg_label_dataset = make_indexed_dataset( + neg_label_prefix, data_impl, skip_warmup + ) + if precompute_model_name is None: + pos_ref_dataset = None + neg_ref_dataset = None + else: + pos_ref_dataset = make_indexed_dataset( + pos_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + neg_ref_dataset = make_indexed_dataset( + neg_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) else: - label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + raise NotImplementedError(f"dataset_impl={dataset_impl} not implemented") - total_num_of_documents = indexed_dataset.sizes.shape[0] + total_num_of_documents = ( + indexed_dataset.sizes.shape[0] + if dataset_impl == "gpt2" + else pos_indexed_dataset.sizes.shape[0] + ) print_rank_0(" {}:".format(name)) print_rank_0(" no. of documents:{}".format(total_num_of_documents)) dataset = None documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) - dataset = GPT2Dataset( - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - pack_impl=pack_impl, - allow_chopped=allow_chopped, - build_index_mappings=build_index_mappings, - label_dataset=label_dataset, - ) + + if dataset_impl == "gpt2": + dataset = GPT2Dataset( + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + label_dataset=label_dataset, + ) + elif dataset_impl == "pairwise": + dataset = PairwiseDataset( + name, + pos_data_prefix, + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + pos_label_dataset=pos_label_dataset, + neg_label_dataset=neg_label_dataset, + pos_ref_dataset=pos_ref_dataset, + neg_ref_dataset=neg_ref_dataset, + ) + return dataset @@ -135,7 +206,6 @@ def build_dataset(index, name): documents = np.arange( start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 ) - dataset = GPT2Dataset( name, data_prefix, @@ -219,21 +289,57 @@ def build_weighted_datasets( valid_label_path, test_path, test_label_path, + pos_train_path, + neg_train_path, + pos_train_label_path, + neg_train_label_path, + pos_valid_path, + neg_valid_path, + pos_valid_label_path, + neg_valid_label_path, + pos_test_path, + neg_test_path, + pos_test_label_path, + neg_test_label_path, ) in enumerate( zip_longest( - neox_args.train_data_paths, + neox_args.train_data_paths if neox_args.train_data_paths else [], neox_args.train_label_data_paths if neox_args.train_label_data_paths else [], - neox_args.valid_data_paths, + neox_args.valid_data_paths if 
neox_args.valid_data_paths else [], neox_args.valid_label_data_paths if neox_args.valid_label_data_paths else [], - neox_args.test_data_paths, + neox_args.test_data_paths if neox_args.test_data_paths else [], neox_args.test_label_data_paths if neox_args.test_label_data_paths else [], + neox_args.pos_train_data_paths if neox_args.pos_train_data_paths else [], + neox_args.neg_train_data_paths if neox_args.neg_train_data_paths else [], + neox_args.pos_train_label_data_paths + if neox_args.pos_train_label_data_paths + else [], + neox_args.neg_train_label_data_paths + if neox_args.neg_train_label_data_paths + else [], + neox_args.pos_valid_data_paths if neox_args.pos_valid_data_paths else [], + neox_args.neg_valid_data_paths if neox_args.neg_valid_data_paths else [], + neox_args.pos_valid_label_data_paths + if neox_args.pos_valid_label_data_paths + else [], + neox_args.neg_valid_label_data_paths + if neox_args.neg_valid_label_data_paths + else [], + neox_args.pos_test_data_paths if neox_args.pos_test_data_paths else [], + neox_args.neg_test_data_paths if neox_args.neg_test_data_paths else [], + neox_args.pos_test_label_data_paths + if neox_args.pos_test_label_data_paths + else [], + neox_args.neg_test_label_data_paths + if neox_args.neg_test_label_data_paths + else [], ) ): - if train_path: + if train_path or pos_train_path: train_datasets.append( build_the_dataset( data_prefix=train_path, @@ -247,10 +353,16 @@ def build_weighted_datasets( skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, label_prefix=train_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_train_path, + neg_data_prefix=neg_train_path, + pos_label_prefix=pos_train_label_path, + neg_label_prefix=neg_train_label_path, + precompute_model_name=neox_args.precompute_model_name, ) ) - if valid_path: + if valid_path or pos_valid_path: valid_datasets.append( build_the_dataset( data_prefix=valid_path, @@ -264,10 +376,16 @@ def build_weighted_datasets( skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, label_prefix=valid_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_valid_path, + neg_data_prefix=neg_valid_path, + pos_label_prefix=pos_valid_label_path, + neg_label_prefix=neg_valid_label_path, + precompute_model_name=neox_args.precompute_model_name, ) ) - if test_path: + if test_path or pos_test_path: test_datasets.append( build_the_dataset( data_prefix=test_path, @@ -281,6 +399,12 @@ def build_weighted_datasets( skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, label_prefix=test_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_test_path, + neg_data_prefix=neg_test_path, + pos_label_prefix=pos_test_label_path, + neg_label_prefix=neg_test_label_path, + precompute_model_name=neox_args.precompute_model_name, ) ) return train_datasets, valid_datasets, test_datasets @@ -352,7 +476,7 @@ def build_train_valid_test_data_iterators(neox_args): test_iters * neox_args.train_batch_size, ] - if neox_args.train_data_paths: + if (neox_args.train_data_paths) or (neox_args.pos_train_data_paths): # when individual train / valid / test data paths are provided # normalize weight values and get num samples for each dataset train_weights, train_num_samples = get_normalized_weights_and_num_samples( diff --git a/megatron/data/pairwise_dataset.py b/megatron/data/pairwise_dataset.py new file mode 100644 index 000000000..e39b4d626 --- /dev/null +++ b/megatron/data/pairwise_dataset.py @@ -0,0 
+1,457 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pairwise style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 + + +class PairwiseDataset(torch.utils.data.Dataset): + def __init__( + self, + name, + pos_data_prefix, # Don't need neg since it's assumed you have paired the data already. + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl="unpacked", + build_index_mappings=True, + use_shared_fs=True, + pos_label_dataset=None, + pos_ref_dataset=None, + neg_label_dataset=None, + neg_ref_dataset=None, + allow_chopped=True, + ): + + self.name = name + self.pos_indexed_dataset = pos_indexed_dataset + self.pos_label_dataset = pos_label_dataset + self.pos_ref_dataset = pos_ref_dataset + self.neg_indexed_dataset = neg_indexed_dataset + self.neg_label_dataset = neg_label_dataset + self.neg_ref_dataset = neg_ref_dataset + self.pack_impl = pack_impl + self.seq_length = seq_length + # Checks + assert np.min(documents) >= 0 + assert (neg_label_dataset is not None and pos_label_dataset is not None) or ( + neg_label_dataset is None and pos_label_dataset is None + ), "Label datasets must be both None or both not None" + assert np.max(documents) < pos_indexed_dataset.sizes.shape[0] + assert pos_indexed_dataset.sizes.shape[0] == neg_indexed_dataset.sizes.shape[0] + assert ( + pack_impl != "packed" + ), "Packed implementation not supported for pairwise dataset" + + if build_index_mappings: + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + pos_data_prefix, + documents, + self.pos_indexed_dataset.sizes, + self.neg_indexed_dataset.sizes, + self.pos_label_dataset, + self.neg_label_dataset, + num_samples, + seq_length, + seed, + pack_impl, + use_shared_fs=use_shared_fs, + allow_chopped=allow_chopped, + ) + self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 + self.sample_idx_len = self.sample_idx.shape[0] - 1 + + if self.shuffle_idx_len != self.sample_idx_len - 1: + print( + f"WARNING: shuffle index length ({self.shuffle_idx_len}) is not equal to sample index length ({self.sample_idx_len})" + ) + + def __len__(self): + return min(self.shuffle_idx_len, self.sample_idx_len) + + def __getitem__(self, idx): + try: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # Labels and texts are supposed to be fully in sync. 
+ datasets = [self.pos_indexed_dataset, self.neg_indexed_dataset] + + if self.pos_label_dataset is not None: + datasets += [ + self.pos_label_dataset, + self.neg_label_dataset, + ] + if self.pos_ref_dataset is not None: + datasets += [ + self.pos_ref_dataset, + self.neg_ref_dataset, + ] + samples = [] + pos_ref_samples = [] + neg_ref_samples = [] + # If we are within the same document, just extract the chunk. + for n, dataset in enumerate(datasets): + if doc_index_f == doc_index_l: + samples.append( + dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) + ) + else: + # Otherwise, get the rest of the initial document. + sample_list = [ + dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append( + dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) + samples.append(np.concatenate(sample_list)) + for i in range(len(samples)): + if len(samples[i]) < (self.seq_length + 1): + if ((i == 2) or (i == 3)) and self.pos_label_dataset is not None: + # Labels... So pad with -100 + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=-100, + ) + else: + # Pad with 0s, can use any number since it's masked. + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=0, + ) + elif len(samples[i]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[i] = samples[i][: (self.seq_length + 1)] + ret = {} + ret["pos"] = np.array(samples[0], dtype=np.int64) + ret["neg"] = np.array(samples[1], dtype=np.int64) + if self.pos_label_dataset is not None: + ret["pos_label"] = np.array(samples[2], dtype=np.int64) + ret["neg_label"] = np.array(samples[3], dtype=np.int64) + if self.pos_ref_dataset is not None: + ret["pos_ref"] = np.array(samples[4], dtype=np.float32) + ret["neg_ref"] = np.array(samples[5], dtype=np.float32) + elif self.pos_ref_dataset is not None: + # Don't have labels... + ret["pos_ref"] = np.array(samples[2], dtype=np.float32) + ret["neg_ref"] = np.array(samples[3], dtype=np.float32) + return ret + except IndexError: + new_idx = idx % len(self) + print( + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + ) + return self[new_idx] + + +def _build_index_mappings( + name, + pos_data_prefix, + documents, + pos_sizes, + neg_sizes, + pos_label_dataset, + neg_label_dataset, + num_samples, + seq_length, + seed, + packing_impl, + use_shared_fs=True, + allow_chopped=True, +): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, pos_sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. 
+ _filename = pos_data_prefix + _filename += "_{}_indexmap".format(name) + _filename += "_{}ns".format(num_samples) + _filename += "_{}sl".format(seq_length) + _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + doc_idx_filename = _filename + "_doc_idx.npy" + sample_idx_filename = _filename + "_sample_idx.npy" + shuffle_idx_filename = _filename + "_shuffle_idx.npy" + + if not use_shared_fs: + should_process_dataset = int(os.environ["LOCAL_RANK"]) == 0 + else: + should_process_dataset = torch.distributed.get_rank() == 0 + + # Build the indexed mapping if not exist. + if should_process_dataset: + if ( + (not os.path.isfile(doc_idx_filename)) + or (not os.path.isfile(sample_idx_filename)) + or (not os.path.isfile(shuffle_idx_filename)) + ): + print_rank_0( + " > WARNING: could not find index map files, building " + "the indices on rank 0 ..." + ) + # doc-idx. + start_time = time.time() + if packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + # If not allow_chopped, skip this item if it's chopped. + if not allow_chopped: + if ( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + if ( + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + # Then, check if we need to skip this item... + if pos_label_dataset is not None: + if np.all( + pos_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + if np.all( + neg_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = max( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]], + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]], + ) + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.array([i % len(documents) for i in range(num_samples)]) + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + # Check if we need to skip this item... 
+ if not allow_chopped: + # +1 since we shift left/right by 1 + if pos_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + if neg_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # In theory if we don't allow chopped we should be able to skip it, but the warm fuzzies I get + # from this are worth the extra bool check + if np.all(pos_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + if np.all(neg_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_io_parallel_group() + ) + + # Load mappings. + start_time = time.time() + print_rank_0(" > loading doc-idx mapping from {}".format(doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading sample-idx mapping from {}".format(sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading shuffle-idx mapping from {}".format(shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + print_rank_0(" total number of samples: {}".format(sample_idx.shape[0])) + print_rank_0(" total number of epochs: {}".format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence length, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng): + """Build an array with length = number-of-epochs * number-of-documents. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int64) + + # Index into sample_idx. 
+ sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Beginning offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise, start from the beginning of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(size, np_rng): + """Build the range [0, size) and shuffle.""" + dtype_ = np.uint32 + if size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx) + return shuffle_idx diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 8fbe045bb..1677bf072 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1116,10 +1116,16 @@ def calculate_derived(self): # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) + elif self.pos_train_data_paths and (self.train_data_weights is None): + self.train_data_weights = [1.0] * len(self.pos_train_data_paths) if self.valid_data_paths and (self.valid_data_weights is None): self.valid_data_weights = [1.0] * len(self.valid_data_paths) + elif self.pos_valid_data_paths and (self.valid_data_weights is None): + self.valid_data_weights = [1.0] * len(self.pos_valid_data_paths) if self.test_data_paths and (self.test_data_weights is None): self.test_data_weights = [1.0] * len(self.test_data_paths) + elif self.pos_test_data_paths and (self.test_data_weights is None): + self.test_data_weights = [1.0] * len(self.pos_test_data_paths) if self.train_label_data_paths: err_str = "Must use `train_label_data_paths` with `train_data_paths`, not `data_path`" diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 818c86d31..814622a5b 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -893,6 +893,42 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to validation label datasets (not shifted by 1 yet!). """ + pos_train_data_paths: list = None + neg_train_data_paths: list = None + """ + List of paths to positive and negative training datasets. + """ + + pos_train_label_data_paths: list = None + neg_train_label_data_paths: list = None + """ + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + """ + + pos_valid_data_paths: list = None + neg_valid_data_paths: list = None + """ + List of paths to positive and negative validation datasets. 
+ """ + + pos_valid_label_data_paths: list = None + neg_valid_label_data_paths: list = None + """ + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). + """ + + pos_test_data_paths: list = None + neg_test_data_paths: list = None + """ + List of paths to positive and negative test datasets. + """ + + pos_test_label_data_paths: list = None + neg_test_label_data_paths: list = None + """ + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + """ + train_data_weights: list = None """ List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. @@ -949,6 +985,26 @@ class NeoXArgsTraining(NeoXArgsTemplate): warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets """ + dataset_impl: Literal["gpt2", "pairwise"] = "gpt2" + """ + Dataset implementation, can be one of "gpt2" or "pairwise" + """ + + train_impl: Literal["normal", "dpo"] = "normal" + """ + Training implementation, can be one of "normal" or "dpo" + """ + + dpo_fp32: bool = True + """ + Whether to cast logits to fp32 for DPO loss calculation. + """ + + dpo_beta: float = 0.1 + """ + Beta value for DPO + """ + allow_chopped: bool = True """ WARNING: if your packing impl is packed, this is ignored. @@ -1245,7 +1301,12 @@ class NeoXArgsTextgen(NeoXArgsTemplate): text_gen_type: str = None """ How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + """ + + precompute_model_name: str = None + """ + Model name to use for saving precomputed logprobs """ temperature: float = 0.0 diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7b7a390ab..02926c2c3 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -23,12 +23,15 @@ import time from typing import List, Union +import numpy as np import torch import torch.nn.functional as F from megatron import print_rank_0 from megatron import mpu from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 +from megatron.data.indexed_dataset import make_builder, make_dataset +from megatron.mpu.mappings import gather_from_model_parallel_region def get_batch(neox_args, context_tokens: torch.Tensor): @@ -52,7 +55,9 @@ def get_batch(neox_args, context_tokens: torch.Tensor): return tokens, attention_mask, position_ids -def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): +def pad_batch( + context_tokens: List[List[int]], pad_id: int, pad_len: int, truncate: bool = False +): """ pads context lengths in context_tokens with pad_id to equal neox_args.seq_length, and returns the padded batch and the new lengths. 
@@ -60,17 +65,21 @@ def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): context_tokens: list of lists of tokens pad_id: int, integer to use as padding token pad_len: int, context length to be padded; all batch items will be padded to the same length + truncate: bool, if True, truncate context tokens to pad_len if they are longer than pad_len returns: tuple of padded context tokens and a list of unpadded token count """ context_lengths = [] - for tokens in context_tokens: + for i, tokens in enumerate(context_tokens): context_length = len(tokens) if context_length < pad_len: tokens.extend([pad_id] * (pad_len - context_length)) elif context_length > pad_len: - raise ValueError("context_length is bigger than to be padded length") + if not truncate: + raise ValueError("context_length is bigger than to be padded length") + context_tokens[i] = tokens[:pad_len] + context_length = pad_len context_lengths.append(context_length) return context_tokens, context_lengths @@ -807,3 +816,180 @@ def generate_samples_interactive( print_rank_0("Generated Text: " + generated_text) if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: _ = input("\n") + + +def get_logp(logits, labels, force_fp32=False): + if force_fp32: + logits = logits.float() + logp = logits.log_softmax(dim=-1) + return torch.gather(logp, dim=2, index=labels.unsqueeze(2)).squeeze(2) + + +def precompute_logits(neox_args, model): + """ + Precomputes logprobs from training/testing/validation datasets + + Saves it to the same directory as the dataset with the model name appended to it + + neox_args: NeoXArgs. + model: a Megatron model + + """ + if neox_args.precompute_model_name is None: + mdl_name = str(hash(neox_args.load)) + else: + mdl_name = neox_args.precompute_model_name + print_rank_0("Precomputing logprobs...") + model.eval() + data_paths = list() + if neox_args.train_data_paths is not None: + for path in neox_args.train_data_paths: + data_paths.append(path) + for path in neox_args.test_data_paths: + data_paths.append(path) + for path in neox_args.valid_data_paths: + data_paths.append(path) + elif neox_args.pos_train_data_paths is not None: + # Pairwise data... + for path in neox_args.pos_train_data_paths: + data_paths.append(path) + for path in neox_args.neg_train_data_paths: + data_paths.append(path) + for path in neox_args.pos_valid_data_paths: + data_paths.append(path) + for path in neox_args.neg_valid_data_paths: + data_paths.append(path) + for path in neox_args.pos_test_data_paths: + data_paths.append(path) + for path in neox_args.neg_test_data_paths: + data_paths.append(path) + for path in data_paths: + print_rank_0(f"Precomputing logits for {path}") + # Add hash to path... 
+ out_path = path + f"_{mdl_name}" + if os.path.exists(out_path + ".idx"): + continue + dataset = make_dataset(path, neox_args.data_impl, not neox_args.mmap_warmup) + if is_mp_rank_0(): + out_dataset = make_builder(out_path + ".bin", neox_args.data_impl) + out_dataset._dtype = np.float32 + i = 0 + while i < len(dataset): + start = time.time() + model.module.clear_cache() # clear kv cache between batches + if is_mp_rank_0(): + offset = ( + mpu.get_data_parallel_rank() + * neox_args.train_micro_batch_size_per_gpu + ) + context_tokens = [ + [int(x) for x in dataset.get(j % len(dataset)).tolist()] + for j in range( + i + offset, + i + (neox_args.train_micro_batch_size_per_gpu + offset), + ) + ] + # grab microbatch + # pad batch in order to allow conversion to tensor + context_tokens, context_lengths = pad_batch( + copy.deepcopy(context_tokens), + pad_id=0, + pad_len=neox_args.seq_length + 1, + truncate=True, + ) + # print(context_tokens) + label_tokens = [tokens[1:] for tokens in context_tokens] + context_tokens = [tokens[:-1] for tokens in context_tokens] + else: + context_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + label_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + context_lengths = [0 for _ in range(neox_args.batch_size)] + i += ( + neox_args.train_micro_batch_size_per_gpu + * mpu.get_data_parallel_world_size() + ) + # print(context_tokens) + # convert to tensor and broadcast + context_tokens = torch.cuda.LongTensor(context_tokens) + label_tokens = torch.cuda.LongTensor(label_tokens) + # Make sure context tokens + start tokens are the same across all ranks + token_generation_start_index = torch.cuda.LongTensor(context_lengths) + torch.distributed.broadcast( + context_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + token_generation_start_index, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + label_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + # context_tokens = context_tokens[:, :chop_len].contiguous() + # label_tokens = label_tokens[:, :chop_len].contiguous() + with torch.no_grad(): + # get attention mask / position ids + context_tokens, attention_mask, position_ids = get_batch( + neox_args, context_tokens + ) + model_inputs = ( + context_tokens, + position_ids, + attention_mask, + ) + maybe_tuple = forward_model( + model, model_inputs, neox_args.is_pipe_parallel + ) + if isinstance(maybe_tuple, tuple): + logits, _ = maybe_tuple + else: + logits = maybe_tuple + if logits is not None: # if pipe parallel, not all ranks return logits + logits = gather_from_model_parallel_region(logits) + logp = get_logp(logits, label_tokens, True).squeeze() + if neox_args.is_pipe_parallel: + # broadcast generated tokens to pipe parallel group + src_rank = model.grid.stage_to_global(model.num_stages - 1) + logp = ( + logp + if logits is not None + else torch.zeros( + neox_args.batch_size, dtype=torch.float32 + ).cuda() + ) + torch.distributed.broadcast( + tensor=logp, + src=src_rank, + group=mpu.get_pipe_parallel_group(), + ) + logp = logp.squeeze() + logp_list = [ + torch.zeros_like(logp) + for _ in range(mpu.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + logp_list, logp, group=mpu.get_data_parallel_group() + ) + logp = torch.cat(logp_list, dim=0).cpu().numpy() + if (mpu.get_model_parallel_rank() == 
0) and ( + mpu.get_data_parallel_rank() == 0 + ): + for j in range(logp.shape[0]): + out_dataset.add_item(logp[j]) + out_dataset.end_document() + print_rank_0(f"Processed {i} / {len(dataset)} in {time.time() - start}") + if is_mp_rank_0(): + out_dataset.finalize( + out_path + ".idx", + ) + torch.distributed.barrier() diff --git a/megatron/training.py b/megatron/training.py index fc3d9e129..d9932483a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -21,12 +21,14 @@ """Pretrain utilities.""" from datetime import datetime from functools import partial +from collections import defaultdict import math import sys from contextlib import nullcontext import torch +import torch.nn.functional as F import deepspeed from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler import numpy as np @@ -45,6 +47,7 @@ get_params_for_weight_decay_optimization, mark_norms_for_sequence_parallel_grad_sync, ) +from megatron.mpu.mappings import gather_from_model_parallel_region from megatron.checkpointing import load_checkpoint, save_checkpoint from megatron.data.data_utils import build_train_valid_test_data_iterators from megatron.initialize import initialize_megatron @@ -137,7 +140,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, _ = setup_model_and_optimizer( + model, optimizer, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -193,7 +196,7 @@ def pretrain(neox_args): # Model, optimizer, and learning rate. timers("model and optimizer").start() - model, optimizer, lr_scheduler = setup_model_and_optimizer( + model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer( neox_args=neox_args, use_cache=False, iteration=neox_args.iteration ) timers("model and optimizer").stop() @@ -231,6 +234,7 @@ def pretrain(neox_args): neox_args=neox_args, timers=timers, model=model, + reference_model=reference_model, optimizer=optimizer, lr_scheduler=lr_scheduler, train_data_iterator=train_data_iterator, @@ -282,12 +286,12 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): label_key = keys[1] if len(keys) > 1 else None # Unpack. tokens_ = data_b[token_key].long() - if "label" in data_b: + if label_key in data_b: label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous() labels = torch.where( data_b[label_key].long() >= 0, data_b[label_key].long(), - torch.zeros_like(data_b["label"].long()), + torch.zeros_like(data_b[label_key].long()), )[:, 1:].contiguous() else: label_mask = (tokens_.long() >= 0)[:, 1:].contiguous() @@ -311,7 +315,14 @@ def get_batch(neox_args, data_iterator): """Generate a batch""" # Items and their type. - keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] + if neox_args.train_impl == "normal": + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] + elif neox_args.train_impl == "dpo": + keys = ( + [["pos", "pos_label"], ["neg", "neg_label"]] + if neox_args.pos_train_label_data_paths + else [["pos"], ["neg"]] + ) datatype = torch.int64 # Broadcast data. 
@@ -319,13 +330,43 @@ def get_batch(neox_args, data_iterator): data = next(data_iterator) else: data = None - return _get_batch( - neox_args=neox_args, - tokenizer=neox_args.tokenizer, - keys=keys, - data=data, - datatype=datatype, - ) + if neox_args.train_impl == "normal": + return _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + elif neox_args.train_impl == "dpo": + pos_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[0], + data=data, + datatype=datatype, + ) + neg_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[1], + data=data, + datatype=datatype, + ) + if neox_args.precompute_model_name: + ref_data = mpu.broadcast_data(["pos_ref", "neg_ref"], data, torch.float) + else: + ref_data = {"pos_ref": None} + return [ + torch.cat((pos_item, neg_item), dim=0) + for pos_item, neg_item in zip(pos_tup, neg_tup) + ] + [ + torch.cat((ref_data["pos_ref"], ref_data["neg_ref"]), dim=0)[ + :, :-1 + ].contiguous() + if ref_data["pos_ref"] is not None + else None + ] def get_batch_pipe(data, neox_args, curr_scheduler=None): @@ -419,8 +460,23 @@ def mb_moe_loss_func(args, loss_mask, output_tensor=None): return averaged_lbl, loss_dict +def get_pos_neg_logp(logits, labels, force_fp32=False): + if force_fp32: + logits = logits.float() + logp = logits.log_softmax(dim=-1) + per_token_logp = torch.gather(logp, dim=2, index=labels.unsqueeze(2)).squeeze(2) + # Split to pos/neg... + return torch.chunk(per_token_logp, 2, 0) + + def forward_step( - data_iterator, model, neox_args, timers, return_logits=False, is_train=False + data_iterator, + model, + neox_args, + timers, + return_logits=False, + is_train=False, + reference_model=None, ): """Forward step.""" if neox_args.is_pipe_parallel: @@ -431,9 +487,14 @@ def forward_step( torch.cuda.nvtx.range_push(f"Get batch") if timers is not None: timers("batch generator").start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - neox_args=neox_args, data_iterator=data_iterator - ) + if neox_args.train_impl == "normal": + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) + if neox_args.train_impl == "dpo": + tokens, labels, loss_mask, attention_mask, position_ids, ref_logp = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) if timers is not None: timers("batch generator").stop() @@ -442,38 +503,100 @@ def forward_step( if neox_args.memory_profiling: torch.cuda.nvtx.range_push(f"Forward pass") - # Sequential returns moe_losses, but this is not yet supported by pipe parallel - maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) - if type(maybe_tuple) is tuple: - outputs, moe_losses = maybe_tuple - else: - outputs = maybe_tuple - moe_losses = [] - if ( - is_train - and neox_args.curriculum_learning - and neox_args.curriculum_seqlen < neox_args.seq_length - ): - loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() - labels = labels[:, : neox_args.curriculum_seqlen].contiguous() - main_loss = cross_entropy( - outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy - ) - if neox_args.moe_num_experts > 1: - if neox_args.moe_type == "deepspeed": - moe_loss = neox_args.moe_loss_coeff * sum(m.item() for m in moe_losses) - elif neox_args.moe_type == "megablocks": - moe_loss = mb_moe_loss_func(neox_args, loss_mask, outputs)[0] + metrics = {} + if neox_args.train_impl == 
"normal": + # Sequential returns moe_losses, but this is not yet supported by pipe parallel + maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if type(maybe_tuple) is tuple: + outputs, moe_losses = maybe_tuple else: - raise ValueError(f"Unsupported moe_type: {neox_args.moe_type}") - else: - moe_loss = 0.0 - loss = main_loss + moe_loss + outputs = maybe_tuple + moe_losses = [] + if ( + is_train + and neox_args.curriculum_learning + and neox_args.curriculum_seqlen < neox_args.seq_length + ): + loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() + labels = labels[:, : neox_args.curriculum_seqlen].contiguous() + main_loss = cross_entropy( + outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy + ) + if neox_args.moe_num_experts > 1: + if neox_args.moe_type == "deepspeed": + moe_loss = neox_args.moe_loss_coeff * sum(m.item() for m in moe_losses) + elif neox_args.moe_type == "megablocks": + moe_loss = mb_moe_loss_func(neox_args, loss_mask, outputs)[0] + else: + raise ValueError(f"Unsupported moe_type: {neox_args.moe_type}") + else: + moe_loss = 0.0 + loss = main_loss + moe_loss + elif neox_args.train_impl == "dpo": + # Based on https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90 + with torch.no_grad(): + # So we can gather token logps... + token_logp_labels = labels.clone() + token_logp_labels[token_logp_labels == -100] = 0 + pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0) + if ref_logp is None: + ref_maybe_tuple = reference_model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(ref_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + ref_outputs, _ = ref_maybe_tuple + else: + ref_outputs = ref_maybe_tuple + # gather across tensor parallel group + ref_outputs = gather_from_model_parallel_region(ref_outputs) + ref_pos, ref_neg = get_pos_neg_logp( + ref_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + else: + ref_pos, ref_neg = torch.chunk(ref_logp, 2, 0) + ref_pos = (ref_pos * pos_loss_mask).sum(-1) + ref_neg = (ref_neg * neg_loss_mask).sum(-1) + chosen_maybe_tuple = model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(chosen_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + chosen_outputs, _ = chosen_maybe_tuple + else: + chosen_outputs = chosen_maybe_tuple + chosen_outputs = gather_from_model_parallel_region(chosen_outputs) + chosen_pos, chosen_neg = get_pos_neg_logp( + chosen_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + chosen_pos = (chosen_pos * pos_loss_mask).sum(-1) + chosen_neg = (chosen_neg * neg_loss_mask).sum(-1) + with torch.no_grad(): + # Collect metrics... 
+ metrics["ref_neg"] = ref_neg.clone().detach().mean() + metrics["ref_pos"] = ref_pos.clone().detach().mean() + metrics["chosen_neg"] = chosen_neg.clone().detach().mean() + metrics["chosen_pos"] = chosen_pos.clone().detach().mean() + chosen_rewards = neox_args.dpo_beta * ( + chosen_pos.clone().detach() - ref_pos.clone().detach() + ) + rejected_rewards = neox_args.dpo_beta * ( + chosen_neg.clone().detach() - ref_neg.clone().detach() + ) + reward_acc = (chosen_rewards > rejected_rewards).float() + metrics["reward_acc"] = reward_acc.mean() + metrics["chosen_rewards"] = chosen_rewards.mean() + metrics["rejected_rewards"] = rejected_rewards.mean() + metrics["margins"] = (chosen_rewards - rejected_rewards).mean() + pi_logrations = chosen_pos - chosen_neg + ref_logrations = ref_pos - ref_neg + logits = pi_logrations - ref_logrations + loss = -F.logsigmoid(neox_args.dpo_beta * logits).mean() if neox_args.memory_profiling: torch.cuda.nvtx.range_pop() if return_logits: - return loss, outputs - return loss + return loss, outputs, metrics + return loss, metrics def get_model(neox_args, use_cache=False): @@ -548,9 +671,14 @@ def get_model(neox_args, use_cache=False): raise ValueError("Must be using deepspeed to run neox") -def get_optimizer(model, neox_args): +def get_optimizer(model, neox_args, dummy=False): """Set up the optimizer.""" - if neox_args.no_load_optim: + if neox_args.no_load_optim and neox_args.deepspeed: + # Required to have something so... + dummy = True + neox_args.optimizer = {"params": {"lr": 0.0}} + neox_args.optimizer_type = "adam" + elif neox_args.no_load_optim: return None, None if neox_args.optimizer is None: @@ -584,8 +712,13 @@ def get_optimizer(model, neox_args): _param_groups = [] for param_group in param_groups: trainable_params = [p for p in param_group["params"] if p.requires_grad] + if dummy: + trainable_params = [trainable_params[0]] # just take the first one param_group["params"] = trainable_params _param_groups.append(param_group) + if dummy: + # Only need one. + break param_groups = _param_groups # If we're using mup, then the optimizer must be adam or sgd @@ -699,7 +832,7 @@ def get_optimizer(model, neox_args): def get_learning_rate_scheduler(optimizer, neox_args): """Build the learning rate scheduler.""" - if neox_args.no_load_optim: + if (neox_args.no_load_optim) and not neox_args.deepspeed: # TODO: this should be configured as a separate arg return None if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam": @@ -744,19 +877,30 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" + needs_reference_model = (neox_args.train_impl == "dpo") and ( + neox_args.precompute_model_name is None + ) model = get_model(neox_args=neox_args, use_cache=use_cache) + if needs_reference_model: + reference_model = get_model(neox_args=neox_args, use_cache=use_cache) + else: + reference_model = None optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - + if neox_args.deepspeed and needs_reference_model: + # Need an optimizer & lr_scheduler so make a very small one to keep deepspeed happy... 
+ ref_optimizer, ref_param_groups = get_optimizer( + model=reference_model, neox_args=neox_args, dummy=True + ) + ref_lr_scheduler = get_learning_rate_scheduler( + optimizer=ref_optimizer, neox_args=neox_args + ) + else: + ref_optimizer, ref_param_groups, ref_lr_scheduler = None, None, None if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") - if neox_args.no_load_optim: - assert optimizer is None - _model_params = None - _lr_scheduler = None - else: - _model_params = param_groups if optimizer is None else None - _lr_scheduler = lr_scheduler + _model_params = param_groups if optimizer is None else None + _lr_scheduler = lr_scheduler model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, @@ -769,6 +913,16 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): # config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) + if needs_reference_model: + reference_model, _, _, _ = deepspeed.initialize( + model=reference_model, + optimizer=ref_optimizer, + args=neox_args, + lr_scheduler=ref_lr_scheduler, + dist_init_required=False, + model_parameters=ref_param_groups, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) mark_norms_for_sequence_parallel_grad_sync(model, neox_args) if neox_args.moe_num_experts > 1 and neox_args.moe_type == "megablocks": # We need to additionally set this flag to ensure DS parallelism properly handles this foreign MoE. @@ -805,6 +959,14 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): lr_scheduler=lr_scheduler, iteration=iteration, ) + if needs_reference_model: + _ = load_checkpoint( + neox_args=neox_args, + model=reference_model, + optimizer=ref_optimizer, + lr_scheduler=ref_lr_scheduler, + iteration=iteration, + ) print_rank_0( f"Loading checkpoint and starting from iteration {neox_args.iteration}" ) @@ -816,7 +978,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): if lr_scheduler is not None: lr_scheduler.optimizer = model.optimizer - return model, optimizer, lr_scheduler + return model, optimizer, lr_scheduler, reference_model def backward_step(neox_args, timers, optimizer, model, loss): @@ -838,7 +1000,15 @@ def backward_step(neox_args, timers, optimizer, model, loss): raise ValueError("Must be using deepspeed to run neox") -def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): +def train_step( + neox_args, + timers, + data_iterator, + model, + optimizer, + lr_scheduler, + reference_model=None, +): """Single training step.""" # Pipeline parallelism schedules forward/backward/step @@ -846,6 +1016,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = train_step_pipe( neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator ) + reduce_metrics = reduced_loss if ( neox_args.memory_profiling and neox_args.iteration >= neox_args.profile_step_start @@ -855,18 +1026,22 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) save_snapshot(neox_args) else: losses = [] + metric_dicts = defaultdict(list) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. 
timers("forward").start() - loss = forward_step( + loss, metric_dict = forward_step( neox_args=neox_args, timers=timers, data_iterator=data_iterator, model=model, is_train=True, + reference_model=reference_model, ) timers("forward").stop() losses.append(loss) + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # Calculate gradients, reduce across processes, and clip. if ( neox_args.profile @@ -916,17 +1091,19 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and torch.distributed.get_rank() == 0 ): save_snapshot(neox_args) - reduced_loss = { - "lm_loss": reduce_losses(losses).mean() - } # reduces losses across machines for logging + # reduces metrics across machines for logging + reduce_metrics = { + key: reduce_losses(metric_dicts[key]).mean() for key in metric_dicts.keys() + } + reduce_metrics["lm_loss"] = reduce_losses(losses).mean() if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: skipped_iter = 0 - collect_loss_for_unit_test(reduced_loss["lm_loss"]) - return reduced_loss, skipped_iter + collect_loss_for_unit_test(reduce_metrics["lm_loss"]) + return reduce_metrics, skipped_iter def train_step_pipe(neox_args, timers, model, data_iterator): @@ -952,6 +1129,7 @@ def train( neox_args, timers, model, + reference_model, optimizer, lr_scheduler, train_data_iterator, @@ -1007,6 +1185,7 @@ def train( model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, + reference_model=reference_model, ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() @@ -1097,6 +1276,7 @@ def evaluate( # Turn on evaluation mode which disables dropout. model.eval() losses = [] + metric_dicts = defaultdict(list) if neox_args.char_level_ppl: data_iterator = CharCounter(data_iterator, neox_args.tokenizer) @@ -1118,14 +1298,15 @@ def evaluate( else neox_args.gradient_accumulation_steps ): # Forward evaluation - loss = forward_step_fn( + loss, metric_dict = forward_step_fn( model=model, data_iterator=data_iterator, neox_args=neox_args, timers=timers, ) losses.append(loss) - + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # When contiguous memory optimizations are enabled, the buffers # allocated by the optimizations are deallocated during backward pass # in the absence of backward pass the buffers should be reset after each @@ -1135,6 +1316,8 @@ def evaluate( # reduces losses across processes for logging & run eval harness tasks eval_results = {"lm_loss": reduce_losses(losses).mean().item()} + for key in metric_dicts.keys(): + eval_results[key] = reduce_losses(metric_dicts[key]).mean().item() eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"]) if neox_args.char_level_ppl: diff --git a/megatron/utils.py b/megatron/utils.py index 26b4439bd..a64a8ba6c 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -449,7 +449,7 @@ def setup_for_inference_or_eval(use_cache=True, overwrite_values=None, input_arg initialize_megatron(neox_args) # set up model and load checkpoint. 
- model, _, _ = setup_model_and_optimizer( + model, _, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=use_cache, iteration=neox_args.iteration, diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py index 55623b303..4e101ea5a 100644 --- a/tools/datasets/preprocess_data_with_chat_template.py +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -105,6 +105,7 @@ def build_chat( chat_tokens = tokenizer.apply_chat_template( chat[: i + 1], add_generation_prompt=add_gen )[len(tokens) :] + # remove previous stuff... tokens.extend(chat_tokens) if only_last_turn and (i != len(chat) - 1): From 836aefaab925ba4f6afb0c265aa540b34573198c Mon Sep 17 00:00:00 2001 From: Aurelion <32250326+aurelion-source@users.noreply.github.com> Date: Sun, 8 Sep 2024 20:19:58 -0400 Subject: [PATCH 27/27] LayerNorm Refactor (#1269) * Add TE skeleton * Update NeoXArgs docs automatically * added option for te version of norms * import TERMSNorm * add te norm options to norm arg * add TE objects in weight decay function * reformat * add TERMSNorm and TELayerNorm * Update NeoXArgs docs automatically * - add Fused RMS Norm from apex * - make it consistent with how layernorm looks * Merged transformer engine and apex fused layernorm branches * Added assertion if TE is used * Removed unnecessary transformer-engine import * Changed importerror text for TE * Added requirements/requirements-transformerengine.txt * Add TE skeleton * Update NeoXArgs docs automatically * added option for te version of norms * import TERMSNorm * add te norm options to norm arg * add TE objects in weight decay function * reformat * add TERMSNorm and TELayerNorm * Update NeoXArgs docs automatically * - add Fused RMS Norm from apex * - make it consistent with how layernorm looks * Merged transformer engine and apex fused layernorm branches * Added assertion if TE is used * Removed unnecessary transformer-engine import * Changed importerror text for TE * Added requirements/requirements-transformerengine.txt * update comments * precommit --------- Co-authored-by: Quentin Anthony Co-authored-by: github-actions Co-authored-by: lintangsutawika Co-authored-by: lintangsutawika Co-authored-by: dmahan93 Co-authored-by: aurelion-source Co-authored-by: aurelion-source --- .pre-commit-config.yaml | 2 +- configs/neox_arguments.md | 6 +- megatron/model/fused_layer_norm.py | 114 ++++++++++++++- megatron/model/norms.py | 17 ++- megatron/model/transformer_engine.py | 137 ++++++++++++++++++ megatron/model/utils.py | 15 +- megatron/neox_arguments/neox_args.py | 11 +- .../requirements-transformerengine.txt | 1 + tests/README.md | 1 + 9 files changed, 293 insertions(+), 11 deletions(-) create mode 100644 megatron/model/transformer_engine.py create mode 100644 requirements/requirements-transformerengine.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7de35027a..249255306 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: hooks: - id: codespell args: [ - '--ignore-words-list=reord,dout', # Word used in error messages that need rewording + '--ignore-words-list=reord,dout,te', # Word used in error messages that need rewording. 
te --> transformerengine --check-filenames, --check-hidden, ] diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 413138597..d24b2b60a 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 53d0ae8 + Default = 217b4c5 current git hash of repository @@ -335,11 +335,11 @@ Model Arguments -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm', 'te_rmsnorm', 'te_layernorm'] Default = layernorm - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index d33ded506..3fd251147 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -21,7 +21,10 @@ except: HAVE_PERSIST_LAYER_NORM = False -from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction +from apex.normalization.fused_layer_norm import ( + FusedLayerNormAffineFunction, + FusedRMSNormAffineFunction, +) global fused_layer_norm_cuda @@ -148,3 +151,112 @@ def forward(self, input): ) return output + + +class MixedFusedRMSNorm(torch.nn.Module): + def __init__( + self, + normalized_shape, + eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_rmsnorm_1p=False, + mem_efficient_rms=True, + ): + super(MixedFusedRMSNorm, self).__init__() + + self.apply_rmsnorm_1p = apply_rmsnorm_1p + self.mem_efficient_rms = mem_efficient_rms + self.norm_fn = FusedRMSNormAffineFunction + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + if ( + normalized_shape not in persist_ln_hidden_sizes + or not HAVE_PERSIST_LAYER_NORM + ): + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.scale = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.scale, "sequence_parallel", self.sequence_parallel) + + def reset_parameters(self): + + if self.apply_rmsnorm_1p: + init.zeros_(self.scale) + else: + init.ones_(self.scale) + + def forward(self, input): + + weight = self.scale + 1 if self.apply_rmsnorm_1p else self.scale + # CPU path is here for unittest sake. + if not input.is_cuda: + print( + "WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedRMSNorm unit tests." + ) + # Latest pytorch actually supports F.rms_norm but I don't want to break builds so... 
+ return F.layer_norm(input, self.normalized_shape, weight, None, self.eps) + + # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect + # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet + if "memory_efficient" in inspect.getfullargspec(self.norm_fn.forward).args: + return self.norm_fn.apply( + input, + weight, + self.normalized_shape, + self.eps, + self.mem_efficient_rms, + ) + else: + return self.norm_fn.apply(input, weight, self.normalized_shape, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + return output diff --git a/megatron/model/norms.py b/megatron/model/norms.py index 19e1aeae6..ba175d3eb 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -18,8 +18,13 @@ def get_norm(neox_args): if neox_args.norm == "rmsnorm": - norm = RMSNorm eps = neox_args.rms_norm_epsilon + if neox_args.rmsnorm_fusion: + from .fused_layer_norm import MixedFusedRMSNorm + + norm = MixedFusedRMSNorm + else: + norm = RMSNorm elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon if neox_args.layernorm_fusion: @@ -31,6 +36,16 @@ def get_norm(neox_args): elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm + elif neox_args.norm == "te_rmsnorm": + from .transformer_engine import TERMSNorm + + norm = TERMSNorm + eps = neox_args.rms_norm_epsilon + elif neox_args.norm == "te_layernorm": + from .transformer_engine import TELayerNorm + + norm = TELayerNorm + eps = neox_args.layernorm_epsilon else: raise ValueError(f"norm {neox_args.norm} not recognized") return norm, eps diff --git a/megatron/model/transformer_engine.py b/megatron/model/transformer_engine.py new file mode 100644 index 000000000..338513a97 --- /dev/null +++ b/megatron/model/transformer_engine.py @@ -0,0 +1,137 @@ +import torch + +try: + import transformer_engine as te +except ImportError: + raise ImportError( + "Unable to import transformer-engine. Please refer to " + "https://github.com/NVIDIA/TransformerEngine for installation instructions." + ) + + +class TERMSNorm(torch.nn.Module): + def __init__(self, dim, eps=1e-8, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `RMSNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1e-8 + """ + super(TERMSNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.RMSNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELayerNorm(torch.nn.Module): + def __init__(self, dim, eps=1.0e-5, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1.0e-5 + """ + super(TELayerNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.LayerNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. 
+ """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDelayedScaling(te.common.recipe.DelayedScaling): + """ + Wrapper for the Transformer-Engine's `DelayedScaling` layer. + """ + + def __init__(self): + # TODO + return diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 97b409c1d..77e7f521d 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,6 +18,7 @@ """Utilities for models.""" import torch +from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes from megatron import mpu from types import GeneratorType @@ -35,9 +36,17 @@ def get_params_for_weight_decay_optimization(module, neox_args): "name": "no_weight_decay_params", } for module_ in module.modules(): - # apply weight decay to any "...Norm" modules. - if "norm" in type(module_).__name__.lower() or neox_args.weight_decay == 0.0: - # also include all parameters here if no weight decay is being done + if any( + [ + isinstance(module_, LayerNorm), + isinstance(module_, RMSNorm), + isinstance(module_, TELayerNorm), + isinstance(module_, TERMSNorm), + isinstance(module_, ScaleNorm), + ] + ) or ( + neox_args.weight_decay == 0.0 + ): # also include all parameters here if no weight decay is being done no_weight_decay_params["params"].extend( [p for p in list(module_._parameters.values()) if p is not None] ) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 814622a5b..b5e7a619d 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -162,9 +162,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Maximum number of position embeddings to use. This is the size of position embedding. """ - norm: Literal["layernorm", "rmsnorm", "scalenorm"] = "layernorm" + norm: Literal[ + "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm" + ] = "layernorm" """ - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". """ layernorm_fusion: bool = False @@ -172,6 +174,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Use fused layer norm kernel (if `norm` is `layernorm`). """ + rmsnorm_fusion: bool = False + """ + Use fused RMS norm kernel (if `norm` is `rmsnorm`). 
+    """
+
     use_qk_layernorm: bool = False
     """
     Use QK Normalization
diff --git a/requirements/requirements-transformerengine.txt b/requirements/requirements-transformerengine.txt
new file mode 100644
index 000000000..2050d7566
--- /dev/null
+++ b/requirements/requirements-transformerengine.txt
@@ -0,0 +1 @@
+git+https://github.com/NVIDIA/TransformerEngine.git@stable
diff --git a/tests/README.md b/tests/README.md
index f5ba5e560..32618d757 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -3,6 +3,7 @@
 Tests use pytest with coverage and forked plugins. Install with:
 
 ```bash
+pip install -r requirements/requirements.txt
 pip install -r requirements/requirements-dev.txt
 ```
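
The DPO objective wired into `forward_step` above (`train_impl = "dpo"`, scaled by `dpo_beta`, optionally computed in fp32 via `dpo_fp32`) reduces to a log-sigmoid over the difference between the policy and reference log-ratios of chosen versus rejected sequences. The sketch below is a minimal, self-contained illustration of that calculation on toy tensors; the helper names (`sequence_logp`, `dpo_loss`) and the toy shapes are invented for this example and are not part of the patch itself.

```python
# Minimal sketch (illustration only, not part of the patch): DPO loss from
# per-token log-probabilities, mirroring get_logp()/get_pos_neg_logp() and the
# -F.logsigmoid(beta * ((chosen_pos - chosen_neg) - (ref_pos - ref_neg))) loss above.
import torch
import torch.nn.functional as F


def sequence_logp(logits, labels, loss_mask, force_fp32=True):
    """Sum of per-token log-probs of `labels`, masked analogously to the loss-mask sums above."""
    if force_fp32:  # analogous to the dpo_fp32 option
        logits = logits.float()
    per_token = torch.gather(
        logits.log_softmax(dim=-1), dim=2, index=labels.unsqueeze(2)
    ).squeeze(2)
    return (per_token * loss_mask).sum(-1)


def dpo_loss(policy_pos, policy_neg, ref_pos, ref_neg, beta=0.1):
    """-log sigmoid(beta * ((pi_pos - pi_neg) - (ref_pos - ref_neg))), averaged over the batch."""
    pi_logratios = policy_pos - policy_neg
    ref_logratios = ref_pos - ref_neg
    return -F.logsigmoid(beta * (pi_logratios - ref_logratios)).mean()


# Toy usage: batch of 2 chosen/rejected pairs, sequence length 8, vocab size 16.
torch.manual_seed(0)
labels = torch.randint(0, 16, (2, 8))
mask = torch.ones(2, 8)
pol_pos = sequence_logp(torch.randn(2, 8, 16), labels, mask)
pol_neg = sequence_logp(torch.randn(2, 8, 16), labels, mask)
ref_pos = sequence_logp(torch.randn(2, 8, 16), labels, mask)
ref_neg = sequence_logp(torch.randn(2, 8, 16), labels, mask)
print(dpo_loss(pol_pos, pol_neg, ref_pos, ref_neg, beta=0.1))
```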