From aa95bb7f1718458b2e0c2b16b56f3bff7abbf1cc Mon Sep 17 00:00:00 2001
From: MackZackA
Date: Tue, 30 Jul 2024 14:05:34 -0700
Subject: [PATCH] PP API and nD Distributed Timeline Profiling (#41)

---
 docs/pictures/ndtimeline_arch.jpg | Bin 0 -> 132356 bytes
 docs/pictures/ndtimeline_trace.png | Bin 0 -> 243438 bytes
 docs/pictures/pp.png | Bin 0 -> 84478 bytes
 .../download_open_llama_ckpt.py | 2 +-
 .../llama_mfu_calculator.py | 2 +-
 .../run_open_llama_w_vescale.py | 2 +-
 .../open_llama_4D_benchmark/sharding_plan.py | 2 +-
 requirements.txt | 3 +-
 .../nano_gpt/test_nano_gpt_load_save.py | 4 +-
 .../open_llama/test_open_llama_dp_reshard.py | 2 +-
 .../open_llama/test_open_llama_load_save.py | 2 +-
 .../open_llama/test_open_llama_tp_reshard.py | 2 +-
 test/model/open_llama/test_attention.py | 6 +-
 test/model/open_llama/test_decoder_layer.py | 6 +-
 test/ndtimeline/__init__.py | 1 +
 test/ndtimeline/test_local_raw_handler.py | 37 +
 test/ndtimeline/test_metric_level.py | 30 +
 test/ndtimeline/test_parser_handler.py | 61 +
 test/parallel/pipeline/api/four_mlp.py | 53 +
 .../pipeline/api/test_pipe_engine_api.py | 417 ++++
 .../api/test_pipe_single_stage_ops.py | 219 ++
 .../pipeline/api/test_schedule_engine.py | 121 ++
 test/parallel/pipeline/api/test_simple_api.py | 195 ++
 test/parallel/pipeline/backend/eight_mlp.py | 288 +++
 .../pipeline/backend/test_p2p_comm.py | 994 +++++++++
 test/parallel/pipeline/backend/test_pipe.py | 342 ++++
 .../pipeline/backend/test_pipe_parser.py | 172 ++
 .../pipeline/backend/test_shard_plan.py | 72 +
 .../pipeline/backend/test_shared_params.py | 301 +++
 .../pipeline/backend/test_trace_parser.py | 133 ++
 .../e2e/test_pp_accuracy_alignment.py | 247 +++
 .../parallel/pipeline/instruction/four_mlp.py | 71 +
 .../instruction/test_multistage_schedule.py | 190 ++
 .../test_pipe_instruction_register.py | 60 +
 .../pipeline/instruction/test_schedule.py | 529 +++++
 .../instruction/test_userdefine_schedule.py | 197 ++
 .../pipeline/instruction/test_zerobubble.py | 103 +
 vescale/devicemesh_api/__init__.py | 2 +-
 vescale/dtensor/__init__.py | 2 +-
 vescale/dtensor/_diff.py | 3 +-
 vescale/dtensor/dtensor.py | 25 +-
 vescale/dtensor/placement_types.py | 4 +-
 vescale/engine/__init__.py | 18 +
 vescale/engine/pipe.py | 237 +++
 vescale/initialize/__init__.py | 2 +-
 vescale/initialize/deferred_init.py | 14 +
 vescale/model/base_gpt/__init__.py | 5 +
 vescale/model/base_gpt/attention.py | 531 +++++
 vescale/model/base_gpt/checkpoint.py | 133 ++
 vescale/model/base_gpt/enums.py | 27 +
 vescale/model/base_gpt/fuse_layer_norm.py | 119 ++
 vescale/model/base_gpt/fuse_softmax.py | 203 ++
 vescale/model/base_gpt/jit_func.py | 40 +
 vescale/model/base_gpt/mlp.py | 101 +
 vescale/model/base_gpt/rotary.py | 52 +
 vescale/model/base_gpt/transformer_block.py | 135 ++
 vescale/model/base_gpt/transformer_layer.py | 194 ++
 vescale/model/base_gpt/utils.py | 27 +
 vescale/ndtimeline/README.md | 55 +
 vescale/ndtimeline/__init__.py | 87 +
 vescale/ndtimeline/api.py | 396 ++++
 vescale/ndtimeline/binary_protocol.py | 139 ++
 vescale/ndtimeline/exceptions.py | 28 +
 vescale/ndtimeline/fsdp_patch.py | 28 +
 vescale/ndtimeline/handlers/__init__.py | 34 +
 .../ndtimeline/handlers/chrome_trace_event.py | 291 +++
 .../ndtimeline/handlers/do_nothing_handler.py | 36 +
 vescale/ndtimeline/handlers/handler_base.py | 79 +
 .../ndtimeline/handlers/local_raw_handler.py | 67 +
 .../handlers/local_timeline_handler.py | 201 ++
 .../ndtimeline/handlers/logging_handler.py | 47 +
 vescale/ndtimeline/handlers/parser_handler.py | 206 ++
 vescale/ndtimeline/handlers/sock_handler.py | 107 +
 vescale/ndtimeline/is_internal.py | 23 +
 vescale/ndtimeline/logger.py | 41 +
 vescale/ndtimeline/pool.py | 78 +
 vescale/ndtimeline/predefined.py | 30 +
 vescale/ndtimeline/sock_streamer.py | 132 ++
 vescale/ndtimeline/stream.py | 79 +
 vescale/ndtimeline/timer.py | 756 +++++++
 vescale/ndtimeline/variables.py | 27 +
 vescale/ndtimeline/world_info.py | 123 ++
 vescale/pipe/README.md | 125 ++
 vescale/pipe/__init__.py | 26 +
 vescale/pipe/_schedules/__init__.py | 21 +
 vescale/pipe/_schedules/instruction_base.py | 552 +++++
 vescale/pipe/_schedules/looping_bfs.py | 1789 +++++++++++++++++
 vescale/pipe/_schedules/pipedream_flush.py | 1287 ++++++++++++
 .../pipe/_schedules/pp_collective_emitter.py | 289 +++
 vescale/pipe/_schedules/zero_bubble_v.py | 1170 +++++++++++
 vescale/pipe/p2p_communication.py | 1005 +++++++++
 vescale/pipe/pipe_emmiter.py | 356 ++++
 vescale/pipe/pipe_parser.py | 652 ++++++
 vescale/pipe/pipe_stage.py | 563 ++++++
 vescale/pipe/tracer.py | 709 +++++++
 vescale/plan/__init__.py | 20 +
 vescale/plan/pipeline_parallel.py | 142 ++
 vescale/plan/spec.py | 78 +
 98 files changed, 18591 insertions(+), 23 deletions(-)
 create mode 100644 docs/pictures/ndtimeline_arch.jpg
 create mode 100644 docs/pictures/ndtimeline_trace.png
 create mode 100644 docs/pictures/pp.png
 create mode 100644 test/ndtimeline/__init__.py
 create mode 100644 test/ndtimeline/test_local_raw_handler.py
 create mode 100644 test/ndtimeline/test_metric_level.py
 create mode 100644 test/ndtimeline/test_parser_handler.py
 create mode 100644 test/parallel/pipeline/api/four_mlp.py
 create mode 100644 test/parallel/pipeline/api/test_pipe_engine_api.py
 create mode 100644 test/parallel/pipeline/api/test_pipe_single_stage_ops.py
 create mode 100644 test/parallel/pipeline/api/test_schedule_engine.py
 create mode 100644 test/parallel/pipeline/api/test_simple_api.py
 create mode 100644 test/parallel/pipeline/backend/eight_mlp.py
 create mode 100644 test/parallel/pipeline/backend/test_p2p_comm.py
 create mode 100644 test/parallel/pipeline/backend/test_pipe.py
 create mode 100644 test/parallel/pipeline/backend/test_pipe_parser.py
 create mode 100644 test/parallel/pipeline/backend/test_shard_plan.py
 create mode 100644 test/parallel/pipeline/backend/test_shared_params.py
 create mode 100644 test/parallel/pipeline/backend/test_trace_parser.py
 create mode 100644 test/parallel/pipeline/e2e/test_pp_accuracy_alignment.py
 create mode 100644 test/parallel/pipeline/instruction/four_mlp.py
 create mode 100644 test/parallel/pipeline/instruction/test_multistage_schedule.py
 create mode 100644 test/parallel/pipeline/instruction/test_pipe_instruction_register.py
 create mode 100644 test/parallel/pipeline/instruction/test_schedule.py
 create mode 100644 test/parallel/pipeline/instruction/test_userdefine_schedule.py
 create mode 100644 test/parallel/pipeline/instruction/test_zerobubble.py
 create mode 100644 vescale/engine/__init__.py
 create mode 100644 vescale/engine/pipe.py
 create mode 100644 vescale/model/base_gpt/__init__.py
 create mode 100644 vescale/model/base_gpt/attention.py
 create mode 100644 vescale/model/base_gpt/checkpoint.py
 create mode 100644 vescale/model/base_gpt/enums.py
 create mode 100644 vescale/model/base_gpt/fuse_layer_norm.py
 create mode 100644 vescale/model/base_gpt/fuse_softmax.py
 create mode 100644 vescale/model/base_gpt/jit_func.py
 create mode 100644 vescale/model/base_gpt/mlp.py
 create mode 100644 vescale/model/base_gpt/rotary.py
 create mode 100644 vescale/model/base_gpt/transformer_block.py
 create mode 100644 vescale/model/base_gpt/transformer_layer.py
 create mode 100644 vescale/model/base_gpt/utils.py
 create mode 100644 vescale/ndtimeline/README.md
 create mode 100644 vescale/ndtimeline/__init__.py
 create mode 100644 vescale/ndtimeline/api.py
 create mode 100644 vescale/ndtimeline/binary_protocol.py
 create mode 100644 vescale/ndtimeline/exceptions.py
 create mode 100644 vescale/ndtimeline/fsdp_patch.py
 create mode 100644 vescale/ndtimeline/handlers/__init__.py
 create mode 100644 vescale/ndtimeline/handlers/chrome_trace_event.py
 create mode 100644 vescale/ndtimeline/handlers/do_nothing_handler.py
 create mode 100644 vescale/ndtimeline/handlers/handler_base.py
 create mode 100644 vescale/ndtimeline/handlers/local_raw_handler.py
 create mode 100644 vescale/ndtimeline/handlers/local_timeline_handler.py
 create mode 100644 vescale/ndtimeline/handlers/logging_handler.py
 create mode 100644 vescale/ndtimeline/handlers/parser_handler.py
 create mode 100644 vescale/ndtimeline/handlers/sock_handler.py
 create mode 100644 vescale/ndtimeline/is_internal.py
 create mode 100644 vescale/ndtimeline/logger.py
 create mode 100644 vescale/ndtimeline/pool.py
 create mode 100644 vescale/ndtimeline/predefined.py
 create mode 100644 vescale/ndtimeline/sock_streamer.py
 create mode 100644 vescale/ndtimeline/stream.py
 create mode 100644 vescale/ndtimeline/timer.py
 create mode 100644 vescale/ndtimeline/variables.py
 create mode 100644 vescale/ndtimeline/world_info.py
 create mode 100644 vescale/pipe/README.md
 create mode 100644 vescale/pipe/__init__.py
 create mode 100644 vescale/pipe/_schedules/__init__.py
 create mode 100644 vescale/pipe/_schedules/instruction_base.py
 create mode 100644 vescale/pipe/_schedules/looping_bfs.py
 create mode 100644 vescale/pipe/_schedules/pipedream_flush.py
 create mode 100644 vescale/pipe/_schedules/pp_collective_emitter.py
 create mode 100644 vescale/pipe/_schedules/zero_bubble_v.py
 create mode 100644 vescale/pipe/p2p_communication.py
 create mode 100644 vescale/pipe/pipe_emmiter.py
 create mode 100644 vescale/pipe/pipe_parser.py
 create mode 100644 vescale/pipe/pipe_stage.py
 create mode 100644 vescale/pipe/tracer.py
 create mode 100644 vescale/plan/__init__.py
 create mode 100644 vescale/plan/pipeline_parallel.py
 create mode 100644 vescale/plan/spec.py

diff --git a/docs/pictures/ndtimeline_arch.jpg b/docs/pictures/ndtimeline_arch.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cef58a92b12a32666d97a8cda2eaefccf7f77e9a
Binary files /dev/null and b/docs/pictures/ndtimeline_arch.jpg differ
zp)&bx6Ym7o^~{d7qS?eWto!hH(fuOmFd5|Bu3M>1gf1EHrA4X?cA((H5|q6-#HuRpc2@$sy&3b(g~AaH zO}$cl%w0+%CH?{eW);iiOtvSx_nci5GMsyg6wL!2nArV|N(^V&IpU0JyaTA%pQ8S@ zZ2$Y5Ak8<*@-VGjn`!Jr${$7 zKzq9o)>!s7jmK%Dv_#^pVZ{rX6(z<%vYDjV^RP5~%o}pJM=iB@@%a8mXD^!8d^dfy zk~Wg$4%ez4hn_?` zZ0Ani7kVbu#w-Sf;lG4_;6`~s$XCa?UH%E^t7 z=8VL7e4>mgZIY!r?%DpL%p_ta4xL-*z9#87e8s%MbII@>2o zO@ZQ6>oli@WWS{@1&<-lFwaC!tfZ~AWhF&flN8~^Kb6NHM`qD0 zP?X1)GcB+B+I2Jhi4XLo-~;#8%A^}SFAY$KbMvs@9dKfH*vMI!JZW(Z;##}<^4mf2 z_qP`H{Yi9BMNJ!uaacCzB@x{25JQjZ{}-OaZbhOX{dl#RA=V((B7(u zIJO7Fxi1*p*G7{Jo>tlS@WD&!-Za#|hCk*?W`0~;Y!~T-j5S$Fxld8vULdk9z~}|Jm#0po6Zdv&Oij;TVdsSk!U`j7F`qOP^=p{CoH?4dq=UK zD{U*<92h^N2a`@lxs68rMjb}hl#?MS&YvgHxcY}dIANZlmaurQo%7UCU5lLHk%OU< zp)o171#~S1z=lleT=~AIZmJX{tK>j|ApA0O-8?JLV}AU=Y)zT&M%&gj|t9L&`p%Z0q|I9>rU+;@^X)qj20>AQs6#z4i$ zG|F>2K>wg}c`m@R@gW@sCfNCQiJu-A8j~+;cQ<{rQeR{8#+i}`2O9TI1RfLD>7&U7 zR}es(=mfexrpNs*WDRuHGuLawA^ z)vgA}%M0G{h=F%D820t>W&>1srwry(I2<~vIF{m7Kq|5&uM2XBFurg5--EyW$g|DO zVMcKaohOQ9(?`q)z?m6l_BoI*2;OKZ^2WN13=g4D-13+=EqSR;v zLz!xcHfRF}Ax3|>)D{bmO|g%T)*|c2Zz(!dHOG2}jzdoJJ)TGA; zq;vokM*F`H-Kv|*9RIm^GB<<7)4d0wuc-=^VcJT>2UVn<`8S2!b%idJCCZ;TWO=TI;au+5 z*X!tOBiV$ss`#!TPe#K=+pI6eGNLwe(lwfFgenazQK<)=P)u7S*%$}CJP~p=diM9 z&rXn_U7(@qqR65d%y$T0JiWANYMLAuTh#oK=7yZ695_u}Oq8IhR(Fs&cmf!qF)3S5 z8n_aqmE0%rwz1u9zmp1CM-N=oA8MyqK(miu?L z{4QTW1GrDhhdH0O>^}84TdQ?A&}o*z4x}t5?_NKfAgdi%Na`a*X)cH5>jI8Tq_z2=YJ8pgH2+*~uxr{oVQGgmxg_A9`kZtKCU0%>bBh z62X_2s`6K3|zft>youVw*TG z^2DDOt_oda+-S=1T!Kx=dK36cOKa&_dV7B?@DCPg>PE=Jc7jYDVs&0{<=)(i21(k$ z;x+u}qTyN*s%3%l%H=eK{QJKv|MgL7PhTtCu58vOT;zT@;vhDo*pHJ|032zv zJO4OySQ$Mk@plU8mR5EKOI@sUM%EUjoO#%g^G7YWazB@ITqRiYZcrq-I?iZdmPWB> zeS7I~+O9|EH9@8xYiB@8$>{=uvqmKCCzTgXr`f311)$VHtUbglJ())R11&zHPlZkPKwubf|UQ-zwEztUGC;D-(g9+E<*=w zx?leBBD?*y4A8Sb{*x+>Vgphs>#yB=_4lr`U-|uqblMCm&vUP zRrNH{uJ@bVc0hP(xtqVKH6?@y9NZMa6Ul}q9s!d+nasQcCZu0CMq2M5IgYsv#60`l zE|lY3p(&7N(xcsL%(uMwn%=Q1m*qyIzjIFYM#0{D18=sgK~--fDYFJyH~l!};7U>r`=r1S0JS z&bFm@9_kaO9?gLR9@cXgiw6$&58K!>gA`Nl*vwVMI+Ev-{Jl+Exxzy-7Iyk^%B=fE zlgbVXo)(#PpP^|TKrWC1@;lF*~1b*^t-Ftj@|a zFC-}_bcT+;U#G@PWlV&<+*EXNL_8kQZ^rsOFjM1P)f#ZEXAPo1N)JZ43S6vYNSh>7 zRqT`tXLuVV8Dfk^b$O-=`#Yl#-i=oF3d~t2CHF}IS;a`hywmCEx^8JrO2BR zUHY>{mS%d>z3p^PGG#<(l8)r=C$3L<$N1@p3bz$53QrZ;u2BoW0p1rOPlqLsY!^1B zx@aX%EpY~F64sCp1!{oK)BX)O2>6zICZ_)@MNVgCiGV1SK3Fb=HLmpbzs&XyIM&COV*bt?KOE8Uvu!AA1OIW9g z24ef`5eNd9WKy-Mb7*h2DVLMCi&_4%Gd0asW(QC7V32b5P#*NyTFfudcZ{~VD#d%? 
zyCM<8Ywfap_~s~XSCDYggUYV;d3~ZiE47|-QxY9orfMNf)Os_Py>X|&+{95kRZYo< ziR<_rZpOdAEJzy(M8=a>Y9TRNjDnh=R{^9HPZ96MgKf|$ds%gi4#Ob7YsP}{+wmvl z$pt^-t~33$L}up0lw*wMGi48njBZ))H^YP3d9Qi(V-hTwJ_OlMBoAlu1kHKGd6YT4 zevC8He${Ez744A17731@8Tt_>yd?8|7dC1P(Nd59Ftbh&de2uaKTlqgTN0G3n6obgT1xq}0hlA*jeygxhn4E;Pj9a5B)B6@C z7I`=4xQE{8u~k7irWib-X%@+yzcp;cl1K_-UU$ShvT~FSY9p9vGf!%Vdez;$-*t}q zHJ0ePWj1wq1|~3JUhiVprAu)sY(9q^ZBLe;K|I#vW~m7FY|co+a)Ek ztmq&0f$bcaichTttL3=dr{4WR%y23*EtM&5^x}5b6CVc=O#3V^zcl(XSh@wc!~L=L znZUTFAdh~6+T|au6l77(Y`8=%%)#=7y93yyvcsv0B$d|n`Y0_pm(n^u30Yq^I_et|1($9 zWq-hotbazi|5`D&*ELsLi!2b=AkEnupw=G%x#l~RVDgau`cEn;8=lnP6v>Y=>nX~4 zkHvj9MG}v|C*%pZV6 z_Zm?au{M^BOydxdzncrD4nxeBIz`bM$aI z^~2f|pHGoI82r>;jp$Xs%*~(d9u&9UbnLyQ58~AwRpX7V(AIec)OphX{Wv&geKQrj z=dt8tZ7hXbziBzSE@xMxb0DEtu4;g~qmMQK0GqrA$%@xBYDVS|gNB+?dr42`N;2KM z<6f0@OmBj-y^YkojeYFg>%459Z7BH2?%r)~Dyqt5tsupI1j%jGo4_E67zqT%09VN< ztsqcl3%*h7l7zqz@@l6&x>}3}bz|oft+33#<<0{99?IG?N{vUBQ5)=OlZ&9hNuN8@ zSfQOwL3Jo>r*>=CwBXKI;>!T%*iXYmc6%Q=RIuMdw5>q=&GxSvF9(|7a5TVwOITrE ztUGtDVFKH7d}ZCfxem&_RpmvpAAy2_zY~989#hH{ETKiWQ!`>Kk68un6U1Is3<=^Q z^xlYn&B>g5C@e3Z=FX~SxyFT7WV19LF=JkX=tfk*LV&>HeFh~ZQ014y}NC&A-r`U zGtQz0SNhUtg=a}hwzR}=vp#Dt-zr^1Bd!uOy)OBtMaLArc!F*-OsM3(0Uz-YKl@C4 z9?j@e!F~mImqS@$gbd0#(biJnG$OA6zhil{#w*92u;yOUr>wSxz$v*1KY-Ht)~yWj zzNP!Ss!jlTDEyYQ*e+e{LN-gYrA_1iBG`B~H0Z;Agz^o}2k5L&T}0gCzv6K%!*#*` z1dpr#bs!r?K!p72ClH|g;|gY6a^kJk+_p z-Xt!6P-Hyn?e2kD)x4wI1>ADVlYQ!#X1YF8h{`LPfi}Xumum^vv7?zeM++ES_4Fl` zaTjx=@7_w&{=0L`Rl+F|xK6_i?aO)mqfgE}E(se~A?a#N5=@Trb5Zf)vh8;J5UqWi zuPGl`EG^6}N>0+|_z7BzOJz&R%a>od{sd zq~hC^*qefL2p>ADAV=?ZMW@bNJxI;_)6-Jh<~S|;z3P3GFeERwl=$t#)U2qnkG_p_ z;?M;0k?^kDu7T05wSEne=wET;H@j2-4k;x~P}tnGvOus<0Kf6N{pq-(P4sD28Xkzn_yL`h?lK!I)`oMfka?-o<~MQjBoA;$1n$PS)J4u2e$fJk8UEVzTA0D$3&|(#ZIZWzkcdHl*CFfk+8#Lan%;3$~1zMyA0dnCMd$vV}&Xp2ketcpy)PjP1 zjK!{|-0|E^LA#f7Pw8u%ByY=%+`)g}T+H1P31D2nt#0Z)8xeda*7cli+efgm`ljaw zvxa+=?E1Mff+xH!#x_ID5%V%NmnY0ywRR$|>EXboYUWqyIH^3DSU}oTR|mu>-AyI3 zLa;{0wD-4OGd-L1OU(InNl`%ATK(%v0{~v;`3TuY*S>HBR2-GDZ##^LnIhxBJJq{u z1N}X&{=QZ9yOJb3`k7c|;p>FQs(I-p;f`|FYmsO#f_D8;Lpw|As3Bu{O!3WNe^s{y z#qk_Oau$K)D>Q8wS9SD5hWnj<_pISsMO-3*gkTNV2U!Dgwjo9=+^aqp>LMW-(i;~= z_^w00$}{-PQ)L84&qb-E+PCeqj}$&(IT+#Gpe)Y#;?7U1;rf>CpH#cJ^sbo;i?U_q z`6`W5lR4!Pi^+_{nE)9hzv#G9h+x;ZI^%JwU>%XA-|KbrrtS#z3P9Z(#f9wfw z%k^0vHYmqz`^wO??og~x`bqT>UBmLiU-?#8ponE1r4-b^1=>Gfc7K@K=ZuQs*EO~5 z$q{xQ=O}_=C-L+W)`z!+Q4go25lKX7D^>|}ku@kZFpmYvV25O}k5(VqZdd#m(_LY;nnf@cN z!+2+BbWQ!hqY2@Lz%)oWcQ{9}kiWj_bv*iXS*jxf zJYAsPz?+m{N*>Mq3(@;V9D?XnJs$JMX%D&R@cBP=&SmrC|76}a?I)ailr{{{_XiK7 zB%sRI*)z)c^Q06dh+ntgwO(mpf!FT(RB&yu!e&k6`aP>J7tF8 zm0`1Qm~*DN+81&S6(00_*ezDUAeBx9-A5TxgLVg#nrKJGq3rvJ#reX653U;SN!eZHt`aSF`XfB3;GbUg8;ILh{!lnG(zm zbo8?MMY+AD6LuA4*A|`jb2`6VYWe>gZP$M=lD~d#PELV?+sB)j^G>A3{p#y(w@mwZ z8~OP4<&?-^Dn~2sSdkP)AcKn9TnL6((fyn_>(U}FzXRDGiD4xdn-mGLykzzap^ zxza|DD?9DC_7B(IAHAhe%5Ex8)eFz~>k1D`IgB7bD)*t=C!kD;~MV zEiQTGbHGTFcz7OZ-G=%rS5XiUQwgMKwHn{Q=dJ4ZN;@xaWX;s^!dq=+ly>I|g?yP9QlTC_T(Px;A3kGBx0=fu=Y2H=-Tf_GgyD=+*!$-UIc;LS3A#Bo-xsZAb zN{n+c@&vn?p^**6{k(zwFMjO3{t?y`48B8a=Y~`?xki$7zg<5!X1)OqZoO$;_HQFp zvB!M`KWUb0C&L&GS}{s~_@7jWl9Jrqis0^`_V4}cr-L@8J`|mF-v$|0o{|&D;yY4o zlGP&;OW?g}P1sn#<4$o(8mR)$`YfhW&~(Sg&jUVk_u%^B*SY|Bzxz4ad3nuzVE?^B ze~|T^iykSN^~xzSiO_#!r_JOhc@Hv@t%hwEkDQkPBTKwTJFc5^ znezIpDyq_ps)BE$Vd?tjvX2Vj=91rAusUxz)kDA5jz8Gm2sH@)tTyssPx#=y{ONVI?KAk08Eja%zgY%yR(;Li8_lGT^TcY@LI~#r5rz zL;g%i%hKdMT{a`#^&vkkYI()|S7Waa|D^iML;f{4_wm8K7CAPN=?9Z|)o1U}&kPphD4zE;F791t~OP>Dw--a~q zug3CGI3)h^kiW#LVjpxifeb3N38r?f`u+o>*1hB(p!{bRJ)QcmK}Y*HWB+Nuch_qp zc7yv4Zn=KhBnNW*cL4rt04k^YKX0~U=zdhW`)Kc$;X=Cj5W8A&LO+|w!u(#c0QYBK 
zvpa4JnSKkB^>Z)%78WQHU2d%-m;Mq~KdAsmDmxDfn2{-Qu@yV9zDp4bbLiZJp;+++ z3ffVx>q|s&wfcY=D4(^bR7uBuGgpq3kWaOINB& zCxseHLJ<%`M_K|20U`7x^xpQH`+2|I=Zy2b-SHi ze0%iu8=HJ`LR>By@#SEp=0R#B(>CWE|tQl@eBw<~dOVxJu` z!kC_!5?-m6PAx%bD)_tAyXmS0GXgX%+Y)*ooOSKFRKlBN4=v=CE|KdZB2u>I8%Y5Z zpS0$zl*q#Jb9)(B>-`NvNP7Xu`SYJQ^#1%WCPM$C4*fPdKkiS84&6MCcxIqz#QFK4 zBPne(H6`0tbk<{*nUuME{C+k135L^@RHzY{qDac=#&Et5xTbU-V!)G=*u8aDZllro zawkCTJi=PTQ&_@n`m*cb*Y9O=|3)?4by=aRGVMHC;$eP*Q4Jys_LwC3W0IOlvc)@Q zV#lO% zGYJou+)OU*i41s3NB2&xzat>kUSwpGFyT!m>X^It@5h+1o;l08(7)1l?HAJN5qL~R zSC+RbrPBye-p_BB*uNUk?5+1ip9qTMPPv@wD^v<2&GFWa6RcodlY7HTRd%x(levhW zuo<(a0xZ*t->jcAOVd_yRkmi3a1G~MJ0K%I;+ z8jJ>jb@7K^wK7n+gME{@f&HmM1LFxNE&Vf^2mBlFITe$chhJdwLK~|IK|=R`=c8Os z31nPb{Q|JmVEMOk!x}Yo4{kK5WlyQ8k15RcZm#RR0yrV={AJ#E$-;aPW5FzTSfOls z!&T~TWySjVE{h(gY6M$Eo;NPmN;Px=%CgB>`Vk52s|4!1AQ~^OobG1oLtj=<5=`^5 zCHXk}CadL+&(wU+;%5G^=}56#QY}`vUK>iJPi8U^+%(d0-s`@jPWdmI`fn`l|M$+d z__TF4Tu1UDZM6~W{5PFh+Sz{y=@q-v&<_q98W*n%61N0S=Vud*=@@+<{}*@ppLK}y zQcdXgV9&zQ#RR;Fz#FtivsB&B*N|k5oj)>YjPQ<4--o}ID@1^N~7CqbNBYV-5NcB*_q322u)@+Vi?bjskSF0VPnzP{LdgVE4% zLucSjL>eN}1g0?OQpBu%Q4{4+5nPSJaT(WZm>|GXEJGdwOYtK=7#geQl9$+%AB9P}wU*MA{W^ z{wmLTRepFkDE*pB{~vV$k#5&jyb}veFj?ckb@QPtzYm^!*$j$Cd73PSNv6YfY%T+q zW=*A@J`r4(uk4GLd=dmY-UVw`OfEx-m4CquP+gVNLqgV1we}0(J)lBuZBwhK$QU0> zNA{$E7fn*AIjLA_BkG-zdXEqL2UD~>_p+jMgaFU&4jR<}qX0ymFi8Ygujg1EDHx)~ zr!Vp<9RcWiT~Msc<&El5XG(I3r~O)DUab~{@7he+o9eIx4{{}w8u`KBbc={ahoko- zPQ{^1*1zcligY6OBsnhprNf%CdHTQ36|HnPMI36qJT9Wen9@f}I!&AfsA~#&A|s;<=6Vt4;WRBUm!e-vbR`sQeAga7p~4uQNc)x zl!3$0nQ5_^F=%r>Pylnwo-bR@Y`YW>+*58j;?;HtIzN{%IdV(Wj}4q7@a@1SnRvc4 zrk{!a(YE>fQYaC^reC>xOMEe7?x5W9Uf0-881B|sUWzY?Ba*po-!Ed!!(MZtAfi~W z_v9*mZYRO_(p6Uj`?3?+O|Y!ov6DOAES_)d#+4NH+GR8Z9h-VLD)O|a@>hf8HneV18f^jl=-hF#+38D_~>SP z>{wgtAaJNUjD8#!a$)Si`ov0f5X3avFfw^`uW8CIzV%G}rw!4`@0oWrw85YnbmnNJ z@;ju##9JDvs4S3tk$&A?xnY>4KHFmW@MfM%&SJe0OqUY-M(kVLw_6HNxwJ6q`=9yj z)Zg}#KKZa;*Ll$#Fhzf(=M}WBBS!fdS2mb%c&&en4gYyeu35L$Srvt+TgASOC&@sJDb+>o z+QDp6D>!`DjT5g=G7dj!>XdJ3Y5#IEuJ`Ml8WG-cpp^kAlHgj}5Ms6~pR0Z_|7Uy6 z%x10Z37N{;5OM^z1sS%f{gvTygOgGDcDDJZgPN|je&j18*}hTS)TympsTpBZ1H@#F z{Q7z7GLdufXY-nMs)tCN{lTOOz}K}fCcgSrb7A`MkoZ@IU|o4k^02!$BS(KvMFDMt z)nT&P{USI1SJRX36|d^BU(n;;`7@!W%NHHj1|P{)LHOhO2l614IMbo< zE`fcfh5QEHMdpX+WDwMJc#PaA$7sSO*h6SN|*7^v!Olxb?RE!2p#M^W1NU$L|X6Umq!C~B}YPh2E zD}NNbw^4G8{-#Jx;$>%x-_{u5PqT^WS!k?INFRZjQ{>ygyP2#UlX};A&dEWs=4Zt( z?|9BgC@six@TN03eb8A*<*`|&zp!48$YXsLL6(8@S8cOTmlH-B`>l!&WTPNz$q|X>*+7jJ~Y}pFy zkA+9vpo79<<6p39QN_n6uW+=T-?QIKYc95X9Vu6n0!vF20?3u}x=MLf-*I1gbZy@{ z^EoL7=Vui{AKjO-260`$usgkXRi{zDwD(y#{r&M3=(gx7GQ7XqY%x1kuh};ud zaYD#B>qg$yZ0bN@SOx|9D)g^d$7#U2=rD#TUf(&i+TZlrX7Ieot55>Q-M?u*dwR4U zU9FBHvmCFFG*A|Pyq~a>9V_}6E(mFOW(7RH)x1K)iwew7NGWGadK{)yE-C?Lid~f`IsNh8D$pi^E7b8HsQ4- zgD98nX8X+H8w3C7gxXApaW_cIqjXrvB2 zuP0UwT&m=rU~_y^(8Yy63pt|M`&`bL<$1&@0H=6aeBkurpiUX!C9e#z!ByRp*K9-I z^fqT1ZmXJ59VZlV6pHUm%5|c0getAA5@f2bRJi`{h@XW*|T0&Z8kC0omaoQ z$fKZ52{}=w;o-E}gQa(wnMwY%&fTK|6_EcS*h}l&TCZPrS@~7#n=ir-5~*d+m+#=% zdx}IuOH_14hKtuNNxx^FweXuP4zdn#z9+qsKgx=D7*dy~FwYs-NdEaxK2?)YUKCcZYaHxx$Us`eBArrQo! 
zJGv0E^Z1)?8Zf7~!kSpAB35LszXOIi~daO`vt*)<|YYcf%1(0>LFI?1q2*XSS{occO(f}%QWQ4)_92T zy|LZ$)UEXEk(oh#!C%V$P%%}!lviU^#%En#!md3awsqG@dDzm!;!V96&r4O0#Kyt* z*;NO7eMcVX#Cjytyj6(ep-D+2;?e!KL!1lcFyx}~7N?eWdovs+mz>@4TTS(ohWN&8 zK*)Gl7U&0PWH$ZQg^N}P6ZJgPeqFb1pmeRo)@jolmu#;63h1TjsWcI8qw+k@XCCXN z=DXPJN94?W{(ZzI@1;#rW?*-mRo_&NZ-UjYDdeA_p7P!k7{ll3p2VyDZWHRYE&cNA zf(gM&kkU#+ZpFbrb}-x)U@C~HWKpPO6euc!YH_XlyxcF3CzU4@-InNh8zQj0h;7UD zJv<#&o>ez!BIeaYw5zz)P_9itIO^_$fTlTY9rc(?sREP*C#?*^5#w0}(LvGevkm6= z?k3cy7AB=f)$pH$KlC;N>1i`cm%lpCbeV1|uR&E$Q+ebj=GHy2uUFgxG*P50{we55 zd%oOC;(@@*qibttv!bE*7KX-+#EA0oz=LnK6JXa@{W?-(hT0-wxytAc)cDM!k+2V? z!B=NZAXPMCs7b>^_&46#D~xhtQ1zp zS=xm0-{#SOEc&nwV+m4tufJLDo!6e$rC_gbab#;IeGijqlW+94X+Tunr}$(3a7NIP zeAR5PPQRNl1J6VKwZc~F`-O7W;pHjtJ|nAq?t7fWOtow3XMc4drip7{H^P15B070p z((n`m+;s30aQE_N2vHR%!i2pEmq3f*(gP^2g{vjESb*DPygb9QGj|Q@>_mP<>BTf&^QZvj zR^*r5qrk+bA9abg)yw-ndeAv*dR(5ChzJha^Qs2x!s_Q`AQ}#o>rU|j%e6=ycLN7P zK1McH*Z&Gp20HZ?acWiHGoz*@5y-IdVJE2#!+IrA=X4ehlLTFzax(&=(N1lEV1T9P z3_q*iN>|%EqsQjdpZ+O;Y&#Dzf?&FVRR0yGK~82&=E-1+o(0B1^0tu4I<7>Gb*R0e zsOfJy_LsVaZ_c!*p^e>1JIFD^ULT_k#%MV>q9A%kWjw1??xJzQ7WhrZjB3Ot7I~6q zBbdtF(-+#LF3zxlhp}LAj`NxL4Sc`Bfw^3%4WLK+;FRN8H)7Owf@=E5?r?$4 zJAw^-3m;ZG>6~--l|_~r?p7JM;>Sz~RXlz$l7uPBGf+2B-X>2%$2^BjcgN#POx-4abz3bTFF3^ZRnT)!uzGa%thO?;y^C+#{MTQ^WgFWna|?q4pz9J z{9SUA-y+(-@%{2L5P8EO?Nf4m<0#T!B*Cyyc2P~}ZdGeea&ls<_1VF~gxwrCLN`zi zQg^{JX+~}wMjrfLmm-tW;t3CS@R1B`aVe8u^0fcC{@~MO89$+W;9XBU53|{mDtob1 zQ>(cxz(|LuhBQ%ye3c4NL!wUndaheIqjqYcKJ{+l;{-<6q>@h><_cMPFJ@>4F=I`_ z%H4vuxcNJimd(Mpg@jv2@&iK8@cwGDYdEd!A_om+;cl}fTv4ip#?kr&qJSO`>ZmJx zu@cDaZv-Y_&g~oFR=kvScd&xq779EtRWstqM*{}jn>R?;0+o$UFCG8=hq zxErPjoKwrA?9|{a6*K?zik5zluN_FG_R+VNkq;|3NHMp{}f=ke?lmxXh4FEy+4 zQu_s#cus04MH_V*VKh};|CQ0bGGft`^6^JW1)g-hK8-!c8I39D@KY|@g7QYusyobFUI5BdqzFc+p=cp|plNB5VQ$tdp8FcEA@CuHud8m*B_&um0bNg$j-TcF3~yS71a1N0 zhcf17%WG2?Gq^lYkjiT$i$0+;z{7mb-?OUTENJle83yw)K#`{4D#IXCXo`=Gt^h^a z_PTq+#23#OOZ3q|WFwy4B*Ol%!-XbHc4`<|bVNnm=Fo8ZZta6Rv4nksGgiZ)8YQA! 
zT;*OsHvaj=u!GL=VcizX z8yAHybk1gI)wINC^#)|HT1!b74zvR*H*Q8jV1lA|>J8u^_KWr=4jz$3oFS^#z7@?; zQdONRo8rU|ZQR*V$o5V!cmtxU1di*Q5P7}}EogPte2HMS+cKaC2rb=&110BjbhR7K zs-!1&6Usdg@#s`flewGU!umz3d%9=Jazoid0Y83!d8%}Bx_`C!$e|RK*sg3+@Tl3r z&TA|Qh-6HD)8?h~3BTEo%R!mO0N;rRc&x{%h&jK+`lhUq4eVP!d^CB$!8G8l7Nf6a zoCLNnrYkkBw1kRkIR4Xm?)i6N_a-$T0k1mY+w6E1r8Ct;c_TQ_w5v2=C}{#)JH&xX z_gfBNlhb+q#&GSK5tX`k*NHICY-X@blr-1FfmCD2Rp6d#?I{YLPamu-UyM6?1AmzJ zw8QOK@)HgmSNgNA$tQ2{Mevn*+0SV~2VKKEotVd@vw-d%fMoGu)le^blQq0iK(N3v zyAr!^uu5SvYkKxWTX&&XQJe?%@L*Z@QRH;AlF<-v60>I{y(prIF|*$`{?!#(YjQcy zrTUVEMOgvQdD!>3JfXOOZ(P|lhKLq;uu;vyqowhs`sAE)q8+H>0@V?osI)w=^~w z#Un@fQ`R=ffA7N`fpb<5K>3Rr6BRT3<<=o_GTHIqU6Deof(?hVb(qC{#n$ILRbF*w zN)Hs_g)k0t<`(pDEpm9NUe&qoS&5B@x)TAI>I0536g&$S^+Ht!{t#F_0(x}{{8|Ks{vYySDg+cA~E)MTlEH5EfH<7`0TyiPrq zDcK5)e(QS`*WsV96 zU~?SNWJT$@_f{Ws(}@2ud2Zt<*5J8>oL8nXglDRIL*=-JrWz#q~k(S^a!CP2QwIRbM;g0sc*=LQ}2L8U`=4_O#pNxNIAZlGD6lXLTl1*>G5cHYfaS z{81;l43x@S_15U>TRf1g7Hyb4ftfDJ549!Oh5T@NXH_cETqX{?dvDF((?>QInimya z|9lWjNxKWaPW>Q$&cA zT-mAanerahrT#*}xB^#$>^bANmoN#G9@nuLZ>4lZOLT)}f+rvrzc)3G6LTj|O=?Z> z9f+{nK*ngPG+N)R-lNhp1OE}M?h|Hl*9E=@kFN#aq^Vs5S+@$&i0aZSMFNiK` zeJFPMO8Lw_-;sJN7ZRX~ZJyhy^S-D|pcXbBW*b;&pZDs4n%tMuQDYt#seSO>^bUAu_pE{ zEh40bNlP3fgIeq4%^ZrUxH%e)Bto3KjJDoGluJ3BG`ohsR8N(q!`=eKe1Qw?mi@i5 z`Po2%oV1a|{m>3JfgDxRh*A=82e7|hPT5Tk3ddfKHe;zfQ^|H6s80V4)fE}jWNkm( z5GYmrU|@T@o;f$k+GzOOv>Ef~sM;6WXUKdT`Pa3<^!zg{mD@RtEvrd>m`5JshIT(u zPDbtnkeN!6>`z!GE<=m7I;D5JKQ7UTsx;YS?fT6{k|T*rUpXn(vlaSjEv`(o-3Vl@ zRqK-uFz)|_)?A`}WS>VF9cuqT6wg8medQ zI0r7*Z^xs3f&{-wJc*0&GK&fq^Xga8WL530an1OwY1N}x-)~vet*PemkghDzE?`ma z@Q}i%-J&kuVBrH+6AhDZp^h|x-~B~Neu%z{yWteVjrsVeS68#6d848)t0<3iwBD7v z<=hl@g*=z8$Jzoi!yugpY&@U|LoQ~6Qakay51q801&$51uUelFgBaGTfIHtG>XzMu ztp>v;*WFKlf19QNZfL@7FbI74TZ!R@EP;SkH#hk%d~(s z97DQi+Q$Pjyn#2aH?~zCU!A_@2nTb0J#T7Xp~|=UI;MmYmt$P@vba!oNweNsK)YAX zzSq`ubhvFP2FRUd=!ROrY{B~(v`w8__6LeWo8K%BaDs*(La5OBJH7|Fo9{{;xh&Ym z7pz}s*2Foonx-Zl)W9r%jGzkKhdG1hzbIH_Y@X1J73SvpKH#_+y?iz-*L6}K&b@Fu ztwAw<(gQ6*{Bisb1w_ZT?tV-uS#@4aiYnF>6kR$gi4KE0<09_n+lUFUwM2P5Dx3w& z^h!j{%jvUTF>BhoRWk}o5PzWFVS74^6hh9%H2kF{_h8Cv=b$Nc)QG2NkhhCj^!KAI zAn@LhBb(-qw-;wIc5}&{acYVE3=$S@pDuzrTFh|RC^)mA3X&-b&MIkb2S3E%PDa*e z_dzM2VyEs%C(mm&TYt6qD<;-$xbQii$3L(CW^*Vz|BJ?T*a?@5`C{n5X_l5v5u??d^Lq zi`qb&%w7(M7n&Qa>E_y@+nEsa(>U+cV8zq^uLwBW$G7kXApG+peTQG8 zj(qKFXIcJbo!rf=PBBhP$$M!`_#f|VSF_b?C*cWTX{9>LbES&!fOJV7_s*s6M!rAmR5_Ldl*jF?E9epY|iCWC8sWmd3RyBJeyPR z|0^^xm7~e4`={G?9pD#z4eoTfdw>2Lh5B!K>TiF{MX0`TT&}MeLt1l2$pzLc^SZiT z;7(4Kl>j%Mthmb6iy~k}X(Z;bG5T8Tm)}>Tu#cC2=$BQSgda)zeHDw0pLqlsvB(~n zkGuTl(!lrb9nkJieBGA2TF@7L=hfLd`D}`Bm>LLK)(@`EDj`qfZY~Mt)&@*14Yihf z)@4k$x@CS&f`tC6lyfid!&}Brh9#Se^ljktpRaD6>C>!N19DZ$&iPLQ z_8>M1)YUYDEi1VzDjsqM^U(FCIF(zNJT16Pdb+Q%8T<;X7dB-6&A0$T>Dh&=yt-Lb zJ=P%j6T}*r*I2#m@AEUq=O9P?NU8g380gDRglr3FzjLu;cJ{TWjHhD<$WyX~==3^qp=l(}-e37b4%ig#%+W!WkG&XI z&uy~lkY=1e%zJUU{>0*6?!@!lt^;e^G`o5mpV@QrY&t?j^3Bl21VMWWK1VmWXR*F< z?nC3Wea5XrLFm1QL>kMsL1Qy*e+orB`AF+W4S1d-oe%9%=m=PgPs<1o^6}|C>Sf(> z`>JC2Wbtlf&84aR*Ux9=jj2$JKc&y7PD464XaKe{(5WX8c;2TFfd||^D+&A8hkaZ9 zRbrkjVYBHUOVw5Xmn)xtp=e~WR=9?*Q;RpIe%O5ZVg2SUX=a( zn+~NtVNMQQd4|56aE~AgUjNqu{_CX&Zgrm+g~YgR7UI?6_WxUX|66(g+w%T7ApCE) z{V&HnTEYcw4D==4JTWXLG|4GqBup@BUtb#DtS6;}ODI()Nq=@sRSM;FDPpJ`q0zrp z9g9>$?cclo%iygsVjtBNM9O}3B4L6hrK*de?lK7p;&5Sika`!BH-m$QaJj9M$se;iqO_A1JRbo{RAJaQd%`y~m^|GjLv92=NSdFPvSB z!aE4up`Gco?kFRGoEcnp-=k_}!f+s0AM{yRXXX!V<;V?foMQO_YdVm4HSXb5)wM>? 
z874xbH&ONOGt{+(1otl=gh>e=yy-!UG)IUOMWGl)Qi`e*hLfj;ehBmJ-ze5g!@>{& zr?>%aUmf{+`@oxMM+rUU*h+D-~%!ys(!=uWq#XKzk|NL51*TcyS?j{ z>2Ej|)|@_K39kH`?iRCRrFIp{=@mtOA7=j3M76FQgF?i=;H|gyNoLiz+F07+mK%5Nc1*^+`uz9* znb~wmTU>_R{qk>=_P^(;^coR5nBDDTS~T6C>~n}IQvTyVH>T8_OF+b)LWGNqxW+*D z0r_%VV1vAFGAc4grMh1TJbaA=ZoA-vC*snaO z9%r*>nLlW%s#-**SD{byv+=_3kvXkRK!mPm!QqsqUdSLU+6-e#1BqllM1}~!a;+QF z*TwUTuMPNxXx;V8*wqqlJPy7mJ(c0?h4M~vR_i*`tJ2dycktJo5r6XDSo&`|woq%* zX0LH-ZcD^Y{%T}sSe8II^Ii|PIjb`O8y!0DOJ(lI@Y+C40I7$0RaO;5=Q!;;0qLFxMS89qFUP=>6@SH=+rIL zhqG%0^|%3=pK;%{xNR2OlcK>|C>L|Q_aONZcGq=S}& zbF+c8MmD&2ppP%1Cd~nzZ)_A=^X`L^=zywop%uBWJv(^XLAii)QkViQgT*xOAGu1S zEak{i#lU-$vd#(|=ww0O&Ky-`=y}VBXGH7g{mg&UiS=j>sF_Rh7dod`Zep(vUtIXTlZ0^U8$@m$azzh3igN z?5b@P{9PFPeJpGpnNeR^VMoOzJ?$#7Jcfv`Gi9_IMFnm7zYSV= zKJl9aibvvZf6D=$Fk2xq_U{-8c|0b*_O-cLLn7;@oN%m5TAy2NZgFtI*EMy$UQQoV z;rI3Czd*AGqjLo>5CEO3#UH)?H`)q;n%y(kmjVu~U&y*X*i&M2_0KOZbawjYY~|LW z6o#IhtA5@D9o=_L_epO4-Yz0Z9fo#y=Q z%MP@PN`^Hu>t%!tWr>}b+$=msG~1*qzs&?*iugRzP7j5DgyWb9=DKgj??#n1<@}=; zq?U%*Dnu{I9p?BS`dCqU_^D68l5Pj;!{oNWmU-0!dnkcDWHfarTnH(DJWpS(M+#Hk zH5#^;O7XkM2xc(!JaS?tKFsLnd8ZdW2f5+G)Dsxq7h?${&up7a3glcD-FdQ_qN{uu z+o;ZTx|;HtGM@mQr`aRA!wWP6;JneVDM-N^AdhnYS}HiaeLC2?iW3C^^l+DaSJTmjHy9$dswAw{J>D zQqw_H`cW-V$I#3qzrqW+ddpUn`7zPd zOdouE=NVp?zwEj|ilqyNdzg|L;8O?bITPfbJGh2b$FE;)C-9Og zDm+s$c0Tb%P(#VY(aws3r)#XHA$&KS;>g^H=YPWS5*Zv}xyGF?MY-t8YP>!Yp=(&J zB6d+G9k=XKgARbHJkA~|7*ZaUZxx7qX`@+Q;`?LM&TTzL4)St}cQ_}#5Vd26GM{xG z?|Dgq6T*Fgzx0nn-?zcyj-7%ol=*_gyu`5!8+_d~^*<*+XhdaPiK3qb7thcY$ z;wCI9sB1FIz&~?mqkXNUob7Q1QE&k*n3?w>2XN@LIB@2bSjV7bVV8_anlw&MkFrO@ zhGXFd(5*o>1HIK8I$={XY7m(?w;LN{*v-~q<6+dh1Kn9n+HxG}ie+)omUWjQb%T6{z3>P;1=o1A;TQSqh9@Wj~9oE-0E z>cq+};E8q8r5Ul;#l8*o64{t>_U13WtNNa=qD8&(-OR}gasn|{Ii{P@`xb27EVC#Y z6E7{_bIhPc$;>9HSolS+QKoevG|3w5mfbk@(Baj~cLh*=5tWq&HRa!p*GxsPn;rtf^<6jYY7OldB8B^`_&DdOkKCD`Sy9KNhE7Vgt@Dt>s}_J?v0KWY%*ARD!#EHPuJKY z?B90XS?o}!Z*mzSK+i7RiD=-cYifC5v1e~XEkKU~8&rmat|efDo&Y2P*?v)+BBIa{ zOQcS>xo}mv()%K|)$P65R%JK1bfSwFaSsZW5;*&?HB$ z46TtLn~@yqLkX41kQ==6tz0r+&0nv2djVf)JGf++FK zPxZrE-+SIJ_u4KJ+x_k#tYv#ZyQF+0J%MH;+ve3Islh@e(NdC-byom+NGM%B)1#1| zO!TxdTfaEc!IGy-%D)L8S}E#>W@3G(^~=`xjqe}VqS07o1 z77#Ba7sBB)Ds!D%YITMIK5Z3|9wIA|pHS3`Znf&g88V9MOAfR9B0G9}b$u-RxGGOn z_mIO@radSdAS7&4_Z0WF-sN^hVedB%&3DlHU8&8^9{hv4?wOswAruVtxG&nj*UBhb z8o@Gb=O(GBIk{9Ue*7c5j*_iX+m%2`#u~jt^JSFXNBBpk*Jl8A4MhD^(tLsivB??B zTmoDY!)y<7D5ZR}A){_#oa^*3+=1@OhxXDz7v^UUrYK$B@*ks;aaF)*fReg3f56|_OJ#rO^z!9%j8VQ2qurvFA{%RcP_h6M|Y9P82Oi?|KO8WH{@R|gPdgm2U8Ke3!s}`S}2hBcFQ6@P6 zX}g;(BGhdQM=b)nV2#D2zv(84xcIs`WBIB0BA-^#@S9-g32XB~Dhvil zUE&(vWd{6Z*K24RQg%N=XsI?H*|qo$5oTj?IY*lpI4qX$)SE0E+8b0Kp)lz_3|S&m zSb}}kd?4#P8+)%OV4Y977h?_l;fek?=Voqpl7k5BcW-gG`aa0)c8jmEB4k-FW_v-i zX;VP^=mT_F_zTsiuMf)V-%d@n70)KIGimqgXIQh`d5eZ9v12%7P>Z;KfL;qVCnp=sxn)+tWe3V z(AEi~y0bS0K}v6r3bUJ`YJF0>7taHyf{&rUtEx*Q?qEnt@nGuBaA_)6acXxF&vkgd z5T-j<#5dXPeDLs08#)(`+)qV98b?c%Ex2GV3QZpGp>8d;4q+~DKh|K~*;!n;gqSYp zkdE5S(Lxl?y@5M=+OI|0rM=-FnJ-Po3$h3iw@+Dv4)}tDvRe4;=fzY+HH)cmqc`PX zk?5zh1)(!ya}$!=hbwu?Ijjb{2EWUFEgI*ya;>W|XQKzdhO#i24>>=QYc3dYab8>` z^ix@ZV7bqhd*!Y(;E}4k%Vtl8MwcDgg&>uUB)i&gEG8;p=Oo)aZ~@51S)umGqfTyy zZApvmo{w+;#EVG(B&JgrR_9`YE1gPq!dA_u0yfX8C0UXPlhS@8To%;(Szy9VTbGQO zu@h)F@+XERrLNyJZCG8(n^fpWiZVTJXA)?vboG}XX>zT$l+be`g~JvMubOp(;Nb*8 zdvo3*=@W=@LeG#y82e)Qx(OCcNF4qeQu);K3p*Y^A%^Wwe7#~mpYi&5FkdQcu;qcN z_bY9=oLZg6r0n=#wy!WwUBqu2eFQltJL7^omeWYP&p`3)WoJX)TT-!nO4zTqlT9uI zBUvSd2It7urkfy<9RX*7(qw`+>9u&0Cqk}WGECS@pbQGq-^iTC9%`XmUiEiRY&D>5 z8-#j~hU|b7^eTLJcP#o5_IoZYCllE`%p+_~gZOm4Uk*3ZR)3k$Rs+=QdX+Rn8 z{zniEr1k#_zr`T_rlXHoo1Lc(W+$E_Ha-!O7JEE9R-R9PTXUtw*Zzn9{&T2t?T-J^ 
zetv}H?bxj&^~S&Hyn}bu9ZrxmbqDbxbBKmV5WS77K6{n(iH=96FoqU>ybWeGcmd{n zzFENF7~b$waBV7(YR4x#m8U~rqrYY5p^KB*8JhPU)Li;im)>hF!+iP2&PP{*x_@_m z&?slu4RRgG>V{WFa@L6ab3J`0MIBvf>kXu_4$b-ax~_;(rZX}rBPl0KuGj%7jV#co zYI~}_buapnQ`>fJlhskEL55(bR}IOiJcSf|g*U~>133k=5n2oQ#(7W0Wm zZ>w6n<6pgM;Qz%zWbu+7n`{Uzd665M6k|00#2l5Mmy9Iho7|^cxAvVtU0z(N0_Eg_ z7xJ=2Adl7MROUJZ4)0$y_J$k*|5}TTY(sp>>E@< zYiYHBqs2`q*bXfzG6hOu?G0k49NW`AeNfRdq|gHeh8ba>Enm0#8Uqh}{evcRx*gTcNHK5~~SmI*c&2 zA2pLlWF|&D80o3s7wBe9h8r>UZhp%&HXccOeb8AfKhTlx&w2WeH@w~=>r>2q`Xy23eD(KiIrgp_(M zT7!kSrbNn<4iN7Fa)@31O2KB2p}sh*y*VW>d$f9eUy_4c+o=>r1HbilXW>zgV#IrB6j2WPi~6fykPA`In`jD zi@2lrv5}>sq@5!s9ndC+754uJ`ZC>QJ#M6G|5CS&ez!1Oqi@w={u#Dy1T4 z%_1rCsGGv%a(~OM@=N!e&;4QOZTG|S2hQnMq~W-0dAmoGjKi~SFR-4#bSH-}1tR0!p2!*;G%}tVB$@uZR5jlh4E!H~*=b!PeFQf6jY-hAS(hLDEr*3rCA z6LVw1*QrI$vSmNkH#2v#=G0v9;$se@2E*2kSlIv5-kE>1o$hhGGt*T~b+gry4pmjs zT3W=Zl2F7|x}wHbOTt_cR2C5uA+g+2!Ps|Mtf9=W3!e%`m{FmA&2Blf0d|_F5eP7~_BQi&cu3#ed zY()Ri0vAN0p|xwgJ-K1&(7o-M>)S)7?ovUMEQx|j&9h7F5JD7bMZ{gyV} z?Ijwwb~P`YsgNM1hDq?$KpcHMIVLQgR@IE;gqduV8pz#sa~27IdTQ0@IOLc;v*VZb z8X7(Q?osybs)}fI1MFosWg|*`E)J% z{oF}b;HtDoz18jPen}q&n70p@O@#?h+*?s5AXRHUuW@)x9@A*I{z(Hkgcd{(QF@{A zI*Yjcv}Qm>fVh3Xwge4yf50+#D7G+J z46xI83niCxZ;&fT5q3zhRb{IS$6x)F#-px4{k^hQjtk-xn{{4%g5ydJ`_9{1>pG+k z8q%+^t!-?Ti?;zZ271KO3#gZ1wSsbkYlr*fJ3x`hujv%wnO4u%^YM_8MHg*qKUaf9 zpm4PB(sDC$1Y}Z$T&I|l--RDGtnd4zTA{yIRI*c0%pZ*dxEeE1sevBB=}1%Me#^)p zLd$c?6?U(ed9SPtX*A)tcUl;$d_I)j;2`}7+!w3z#_Y8=0Nigd)$=0a#Pc2L zx{0L+k&$m!NI7Nx~A&7Dd!8#h}~ZO6Ne<4|lO7`r|a{EmJ@8S9aM= zzRJ1o!jYFmmr~D_LYh`KpNKR#54<3lI3HT>?#3=JOf3@G!>@8Rbg=8IJs$N-)2>}a#Dm-KEBhcCR_g};AWGb3dTWB15xTkt_)^kh zrUJ}#btG$WQUf?rkEh*BCTCV6iR<@ne!un{((ltaTs>uS6|Se?OV(uzAYkYyheh6V zk=NwvG>lGgMqrvU+lD6`E3H2cStVm35vlCm$|U{XMCupa#)9zTr2(S`y7Ao+<@pgp z{91WY;xoAR16V_#cz9wB=~ zJj#8{kil#;N`IS~nE~QSXJ&Y=UzO3+%n5QCXmAUNc(Vn&K1R)f;)dR*PURr5TCw&w zWYlD4VPt0kbJ#JZgydhGUTJ!1#)VQVYscH6G*?@ zt-88W1p76V=mkX+3-P7ffb%vBTh(PaR&2(n+nz?=6)u2$*HFgb-Akv z^6ymp=(dFu);U7AB-@yMvkMP>|8_~%^B_RMLf(bH5XJvkWpkrT13$MiNDxS!gM2~} zn{}HbYE}4gWv~>Pm5oU|BXbSQXeN#pkBR{qd`Pxe5yh@d)vckR2W8P_Q1V2Ge(v~% zOMvL@y4$>yah{7d-9U8lj!4-HYzd|KP&)+N@E}(+f-=WOb_+ZcIDrnCGMHgQI$?VkB%3{ekkfvf^KeyAkdIfu z<7%`%L7a^Pm28{?uDR zZS+~Wb`?h?oK4a6=(hy* zIXYx?_ARTyYP$J~3k{b92T&9YuR2`L8bbU6m8rDQIC7knb>zr@zG?k`^6vQ6-Szu2 z$xntnsT+Coj>^a4zSZ>TslnMCc|sf%mPJ#lsU%4aJkV}ijj@%wU<+h1^`dVRH3vz)KS_`vyu14EC;{Zor6TQf< z5M(Q#H2lSGFWs(Me1Z+$%hK>k`)KWJY=As)smqYI2fH$ocf4VayLcTxH*9ZF2;VL* z+#AwCd9cJeemI`K=zGib{j=!@p}%Zuev9`D1{UZsqLyn{TafnEMoG zu*!;2;DYAG!CQi}4YMk2TdwV<3puQjGvy!#<+otghvg1dGxy&wOAYFEeKt#OY>(O> zH{3-kO%%Z%SsR6^UIQ1PR+@(R;6?1XRgZJhP@YASx1OdOh z1^=~|I9RjwJ#M{s=hrkevQsp({bDJ9w^^i<(Z>NaDp!jxXrt}*D~e|lA);w9Bfh0= zV){bjmiMc1;ja9X%MPfI`&(=?V}JCgCv98zv0%)=d%MTfWChi%qbuAd;8{m}83?>;C-*qi?I3w^)kPkOr0;KQ^?b}m}f zY~l6tqUT>k0Sui>i z{A&ALr5e%*!NfWeJM`{RyN%6nBPXhCds<`5unDp^8YYdFXNSM9K&Qu^GGme4*4{Hx z&`-%zmZSQLOyIj4xwfI&izy_UwJ2=98;e$q$e_{|i({O|U}=_7d4pDy#;WKKLRK%} zA!ucRpl@yDU8Hw9vN80-8kSwf>YT8p)UA_C0t_qK8~PPp#&5lZ&H?BihFb`dpLVpg z8cHZB*RB&FE7OG!^$)S=Q7z5a2V-O;-Aj{^X$7~!pCY)l;K^G9p`4?`;G<6OuT*zP zKJLgcUqktJ`1&-4x#iV$2P6o?*4ig#PhK0B;kmSG4}u=&V9SSWEsVsYLz`g>2B`1k zu9+KEnodG7*-!&ayyHxkO{#5+){*ee%m0&q`9o2M4jejg=)j=^hYlP%@LzWT@CWB# DBGHzR literal 0 HcmV?d00001 diff --git a/docs/pictures/ndtimeline_trace.png b/docs/pictures/ndtimeline_trace.png new file mode 100644 index 0000000000000000000000000000000000000000..51517d470976af9e99c12d461056969b62e5a74b GIT binary patch literal 243438 zcmdqIXH=6}|1Jz@L;(e*gA^+r6lv0e1yFkLMa9rNgidHGf)pKk4bppP0jZ%$m0m&% 
zy$3=kkPtYTb^hbbdgr|BJkQtjAz6Fvn;rK4?O(sHJ4{1O@j3+y1qlhs_1CXnYLSp! zJ0l?hcwD_gocVdQgC`-OdS>(Dg~sa_FE}+^oh)taK_n!v!s7JF4Ya!%(vAQ4DrZf4 zIs3!xWl|o}><@!vuY`glzh1ujphJ$PJnIWZsp-q+Po?499&%fs)8xyr@;3n>Q~rrK zah?ox(|+S=Q|-ZAK>yj=ui7WrSyH)eI>q}v3S*>QH&??ff(aW-F@J1iv&&sJA|V04 z>0ek4Pt??0*u6mo4-ndMBzrk!R+(T+sJuWhTxSO+P?FpNxbO-kq+Hs&L$b?T`YM&2 zM6)MFb%6Hu<6=2ws!Jcd%IWfYUE=8SdiG?KR86QT7a2%mbtG=PNRkw8f6ZHS8~Ogp z7T6y3JYDW;APd_&XliPK^k;3(>Q7>KKj+{gMmgMvFLIM4i@O)HsVz=T-pljIedX&} z{dEEy*+VF+Rln~FeNfEHy!e%ABlYd9qm}CQn0NG_@gE9qextE_e)g<6X7WXZmhk;M zd#=8tfhoPgPcKQ*NbO6IzNKN6yM6yx)K&f60-In?v9G0<0~xOFPq@&apEo2zS(%E; zCi&A@vUWd?=Y<)n00M&U#h=FEe7L*At#}X3t#&6V{8iwO?g`#}N724|8vNnvXWcs< zU47x#wI?D3e!Y>Wn*99W@$gCCou~!M5W))10Fo@ht?<_Tj~FQtb9=W>fo$KK$@hg9 zpD&-b82z|yc;+l18>|f%MC8z zq`f+YW5La*HE>;Z4rI!{BwMEc=*7KTAerZ0k${i$>Yq<4xrH^>u zbkp<^RGgn^LOzTAki~@fE-dVBk{3PcZCPw($;|TqJUM7ZWbg(?e;NEyRTf*YP*wM@Cx=#&1yB52HC4&P_SbZvnRaPa(C66T`->@rt zl-WDh&B?sUcNns*mWPsdzclAnel!WybhlEpVkF6)V?0HpRjan@gVw7cYYbwxvIYXtCRDj0hHqrKujIS|7*O&yKP;o)y6oFbI!F=~G%|nt2h~)| zb!Y0*bjMw?!w}E;C&8TeKZFm`$h0$+%WctEeF!??40!80U*7NHe^06v zUi_T*tH7)46&z24Qxyd7vURa7-yHtZpdkC5sW8~~`=|T|k}RMM6%ne)7hK;r)1+0_ zUe6`pN`6n&P?%7YP?4a-YKbqfeS0-}$LJ<%!A#|@Nl&e2{AR_Qo8iJP|E{!*Td<5rrc8O&i<7btG&r>#e?N>ij9j?jgyJZ ziUaf<#P-HvdCWB+*=L&a+CTL^<_K%bXI7|yLVI}Pq` zXaVzwb=Y(!3#G?u3*!op3iWl(3%`x=k0#p2j8TmBjNT|T)kJG^6&V*+jy~OV8|52| z&Y#ob7|gJduz6{_Y|TG5k>jG7r5KZKQ^1_xJz|nyt9uY($#^|0X~;lC(1Fx}b(_D$ z_)9)0=jV9ODDT)*)>!TstfXK&GcCh3!?eKPq+XkRq#n8}(bu+|>#SG-?>x=B`Od1e|i6ZQtRlhNKZ%|w*;j}k9+#@{X`VABUijFI#pDr9M9N%mb9h*Y> zB!l5cVAjm*Z4Y}bLsaVpY|*wUZ;fqyf8Kz*Rnks4l%bjAnN}VvNfi2w3i#6d=K44J zmk{oevxPc`-VMDHnvm6}$Q5n&6nD1n6jd5E88v--=c$e4M@jmpqE8z=s==$0`jX}x zFW6{BEHQ~4k6|WTCMSZdrjGT)CY`l{t_~iA%~i}pk0LiTSlm>_I81@`scI)oSO#GzJ@C)As8<#_>vOY6q)FyrAvHd z99NRAZLH_k?fNk6*w;X{&U? 
z)8*WycM0>;xKnvCg?9AT!-u2y8`P;@nZ!*zg&dwtE(vjMa9#U@CFXVaweIN{@`snz z45YsMW}@2{!^F7^Re2NLT`JC(x4Jv?@>Lg9o+dtj^;%^zZvP`~cfl%Ec7=A47QU7r zBZ;9*=Wx0XOW5f%{}~X!npVwQ!iAFZ2m3gEa#AoYj-`mE{w*QyhQe z@$Pu`c1~(+>b>WxZ{y2Ag+rsk1#EYvG|ThdlGm!>BeGgm)}|K+8}q;oQ@;`h+g@_;|a@W4tYxi7U5bVHa6 ztae~hSxdR*F+iDK<0cD&jGxVYx&1|tzR~pSj|`jd3$5Oi^^j}lIn@Sm0~0^HZ^%s5 zFMB_Oe>y04@EanQjjzF$wF)e33^x*TW^xiBU>y7m*#ffZoro)+O21p4fKOwlCSS$oM4EVX~;Q#+_q=YZB`h&~u(b z-g-lX=g7d zlXeYrDOjkOUcu=h$J-F~=a3(LB_-$K__z*_=;Pd|ayG6~oSBTj!2DD#dJCQ4^m1nX z2Xnx|aYq|<>TcBZ@OVAJs&HGo}3am@&MXPjqjeQ#;o1lpfZOmB>PpJ_H{jdzwu z?|@zE&7`~i_j7y~v^Pg8tU3m4eNug+j;b#9*P0fAqUXgvHiZ2_b4URE zIC^IX@&F=w;Ccn8d5`5{d0C5)m)iES0a&i}!$UP(#qwGcJDMy`FGZ z67cSYOda((YKH91oS4^HFwT~i^m8T|%^K<8V~bDdK|@uK(pZMO(&YA0C&|e%6z8hn zpicun6$hVY$Hh2(0&GAT5ehpYYGXF!D~xZRT5s}_$mo$s13Z3wS{`3Fz=eDXokdOulxG=9@{x7yj zTsQ`}dQyaG)BErCX!JZV5E1`_%AM>#KNnF5L7QZk&vdbG`Tn~-3fz%vHdi*eA{q4G z$UjgH92B*gG8z8&B02O`DrVvK>#SnG!0x+`tHGrdGSB{fHQ{`_>qGaj^DURKCwoha zcb!-NzKbM^uM0xjjw~;JFSFK>Yk{PtrCr90Qgpfe{g;1_R2c_3ayLEoa434f-|`CL zzkKzi_j!`o-`ar_$oK`J`#)ZVk!IkWDST1)yZT?h+uTgYIq&qwOuwJvuODS{CR16G zniu5Z`tbLm{~A#-kwQF0-v~b73jS-Vs^L$i2?$%~{O>NL|K@I|>3?MmfpV-@X?BrU z7{dhb{#Og-nmMq4ZTCO4KL<{ty0DtuulxMpEhG(?x%8KC|Fe}qUFF_A za+DKhQ1zZD0v@|m9ZJoCQHZ*|k*k)rY56O0;h|b;q9N4!>R&eY|Le-X5_MsQ{2$lz zl8jNv%-t7@0V7LI`OterpW=x>c0i7etsL*n?L1UToYyNcTFF&S(ent<`@fq-9|Pw+ zY%+5+^tW;j>XAir06pqGcRcdcp%`rhJ<|^`T2I`n|8|WdN>coN*H>A#f&NRz^X=W* zn3j;xz1LA3z<$*fNzfbUBP)H)`H8(4HPSX-O2%JqY;1j}PA6@l)Tlk$yhWeT8*G~l zzo4u@$3qTOrKx^topFt0?X!?8wL@77^ton8XC>~z?O?Uv#)I}BXZ@|k*$)R(omjNc?DY43&vAZDZ*KI(&P1_E zrRMHARB>x)kP5>P=XI=)-@VKltNQ6e%`WYXgv~kEWU7QFFM1v3Yl6vdRQbjb?07b@N9$rrF=!_;uf8v}Bc*ZT~%+!0%*__!BhiSci+;23YAJe7%Z;=Z`MRpv%)f@z8t&EpUV{8u<@8jCOg$>C_ zRYXG{Y(w<%K0+CzLK#>M%1?1X(txAsa;|?lon;}S=Q~FRvj6Q|mN|(bB`%mCqPjC$ zpp7ENvn38NtiE3Ayw-g~OPTAHdd^nJ%TCuygI0IZ-b&egbZAJRRqo7^stWH&_aC)l+2XC2Hc&XJ3#` zGu+cNCoW4f`yl1afrzCZM{0c!u&E3Fn8UdcNj)(F2_dKU$Wj2;!HdlEnW<_WIvHI^ zGNVQk35*CoV``p%SkdpJ0BDHnPr{g_{qlF{T^Go(@oW19Y5R2D%??=_nYVF0I#JfY zZe(CMM|o)@06{pOw(c1x&q3)VZ2GOl=_{e92rVJjJ%a;?33|XKRDl;SMd~PB_?nQ3 zGt9w&!|J@~#}ihHd%GulG2EXQ3_ra9e7q%nyfsX{VLdxK2Ym4O3!met{xhz4RF|U~ zC~;UPC34*6sO%+4&^V1({LZeSxd=PbIQemhwWOQB!mhH_+ADFJ#dhn>l~Bj!u5ahL z@qT{$qc)!N^9yD{-Vm#Ln7Jo(XJ#c{(9q<`^d?dB(u5IuygfIH`GYA|ecz5zh}*43 zDAm{Hsh<@YyNHYZ7RNtCL?uT`c7A1EekJzrRHBg5q`~X)jj01{vb1-_blKY%rslE( zNl!XelZ1b6&(w~9!Qi;Up~I$w)jqc#8VQ;nq&eS#{>~d9fb1i)O>B;Wxrfv2;gz80 zmlhVW2e;49_Tq;{s$;lR-O=Wd0;uSXh&KF_Z3|IS&RD|u-{`0;7mRlxo>&^aG@R1>(qTRGVBA~cPRMbzK7 zc7NN`Hxu~vRaE`T@rdLYfspR6$(lN{8O07&)?Cx4h9nbgu=rjUCr` zd0ci$f`abNKM1mOr?PdAsJ*EYRDW{%(P60&TZ*bjs2=J=31%h-!9*`;-(be9g$N5i zIRi$WVDOEYw3@kyywxLWz44|;6Y7&Xj0(^qI;M;DAq&q3Jq#sOfsiXUGEi~PDD?Wg z{*NZE=)cbQcN7ZBq!fw2lCf@^3j8e@xoHaPO%%}*9^ha$eymA)YMtu_hQCy`E+{K?jm6F)!rro`K=)K$Gsx`^HTL`-~N2Px#WtBemH{2EA=s2cjEZng@G zS_i8U&mJT&m_QZK$7FFPdA}&4XIrb(f+%um=13j$Z8{jhYGbuOeY+!? zL%$kE@p81vOV1K0J|g3H(#3OcVT_wEE0|^+=3B1+sFgrYJyYw(Xo5XLyKpWobwrOFQp6H$Z&cBf*ieCF|WWRzsHe z6r=K)G<=AzF|Qy~H`a9HRGxCNvv7or_tOpC>TbZEjF(uQUY!tTuAYNDQ!Vk4Y)`sK zRw-4wtAywI^VshPM(&MH?bU79sD~iET zUCQ+#yJk#)cYhMWL`3w&#&e&;v=xqRve2rasMK_PVGi?pHLXC$0$NLq}5bD@^;Ly2q1Tb>#?6 z7QO++wb?Q7gSvs8x^HRsmXO65D-?1iys-oBFq+>#vAdYlJ2c@s3052_LpKfEL3|FI zL_}GP`7>TU+(*zwrio|CUT2l%KThHRxJ8Fy4vges(WUEtBFAQ7X7|*gvSE?n;!?vL z>@`3jdMZC(NI5}z2P2>K0FH5WpRLJqDRO&nwUvqVEb*Nzh1~QrZ_>#-6qNa)B?X2CS=Mj?G;%nzBfjfU_wW}U(snvHd#g{UWEo9Kb+hK|(p8S?W}+nM1t z*G4Ow!uO3r>qmK5Xj5zz@ZMpL(AJlKI-dV+$*(fboW7R-qgC)Gkb*nERfuRJSg6>c zMDacQtnW&0(yxMEpEDwH6^%wij3|b|?}RJ*U3Ar+bLVCwGpJ~?<5I;I-Q?$9!|h>? 
zC}lf7HNHZK{e_p_0${44qTS(jSgaHCCBEveE9U;p&p&jj0VUg=&N{q`RYpjub+&>ewXs}r)LJ1*)Ha!^Kq&K<<}wN zg^Y*Mep`X(>BvfwajS8rs6>9>CUT#cZI|y5^VBR7U0HThBUPZC7X)%-y%A+C7W4Vas~BK;6>ALl|4 zM@_!CH=1tey)(PQO|t9iH>W5xQm2#*y}7wbB6a8QvhA-Ch}a6_3u)#3J2D3GWeD{V zZe2m2PH+0(9=}Z17-e`V-gP!M8Q`8-JLAPx#K>AclQ*Qp^E-+%GiYnSnO| z$J}ySNqxL=Mr$R3(ts9jH_dw>qTGL)o9RpGnWqO-gB4SMI%~y#D>o_tX>U*XRP)HG zGcVA+%)YA#xXz=UZBDm7PtMK5S!FUL+Y_r_V2v;3ZeEW|Fn3+)H^oL!5vp73=;m|% z$4kvtEOlzf7hHTw=Sz*`*Dh?_{SR4;Yu_3Ew6}rC)a}F`@fBstr9F}pus2qtxN36N z5R6C%Sx{li*LvcY$?-qYXsV_%NxIuLZb(caQVq|W@!Fy@c^&sLXL2nku-q1O5fq~N zMpqxkDRaRDsP5h)CY!}f+4%(eRQfCU(TjFFlIZ+@GL&$X-@}6tuPIwVGa{GHnz)uUh<-R2e50 zH_peYyz~ryFyvn>cL(60OJOFUYp@#BZ-K5JSjy-TPEjtR``WNFQsOw$fxOkSn#0yW z4k?4y)Rl%xhAORn$C@HcK3C8iH=B=JM38A8`P{e^PB`BvCo2=RbTmK+&cu4n&OKHz zoIiV`>o2Gg2+7&+uw&(Q2(OTOcrkIG>MFnI{JLg0(t=6E0FzjS1U+6ALCb11J4_Bd z0p8)cTA%*MvDF73T(j)Ivog7-K`kqK6*vA)t4S}jFls!k@DO@yS>T!!zA2*!x;O}# zrQ6M!uRa}eha^8A6zVC{!B83jNKaZI$z~eN1(xz7aNA8O1UG9W0){A=u>gqXH+=`H z4is^cf^P1A$z@MJ7;(>fs}Uj0I8LEaR!6s<8~7N$k?=?giYS?~0KDVXjx>)2y%M8n zJ)G7I{36V7oby3!EGi8c;F!9tq5T{uUu~4fL1{}BvKp)Um_PY^Lgq0 zG8>(3S3ECtfh&yUIObR%i|rVCG%--oRB$7}Z>Xit2`#WcB$4rWNGQI$&pfZQ$0vV7 zf{+=JC}{->ryS^<8whB?r#LCI8^PQXB|%EkmY*hfVgZ9&XOgxu<971Qbc9T~E@Bq; zGpG+@4yMLT2Mo@^S1nAJTqlj}*bOYnRq+@`07NXynrf(#A$e0~x+7_KDwG)vb~t2y zwqv!BSQ_XItaO&~$Do;LAvd%hz6K0#!Y7fL_$j3m7WmPE)(8qOKdEAzlP$WsewOUQ z@xd@v7beTcn{Q^J--wj!gb!O?J_`NaR{x#yUri)oK356;yQGo3bNQ2^b`?)Xuac*y z=ix`!l`Z>fb9m0?ZtnzVV9VUNU^L?|=$Z&B7a(O_&`b@8v&MD0EdB9}Wvn0lReY?N zjLmGiOxC5L!rg`f$Tu}d)>fsaeLf}P?IZiH-5I+&Aoh?`>P+ zGd*a)u+dijHvcCbif61j+?<(s5KdQ5!@N?yBF7#i((yIr;mk{w6}TRtP*Qy(aL_^ z3LagP)gT~!5!vHk&L(t&htHR{5)7Z@DU!aI8A;-$H8c?PeDFkFwdfDh@)`e}`KJNN z=72Qb-ak0F-DD}q4njWDom66SL~}~5jnq}|6#8oX4E~nD6}r!|C$;%xT*CLKxFD@J$$Cb0T42Y+Z`lm%Kc%g?l6KjhgsB{Lj?njdhX|K zPkE}-MbyY9t5xd_#df-7W;|%eVlN|)d#_Ulgm94EOXq1f}@JNr3Vo{Idh{aKFh$*kxG7`XIf$n}~q(0rFQ@DuG z-#n${(~oTgiZI|uPHd$oPCANQ;^o=$GUXQUDf$NQ1$^#!0Pq{Hn`2?Ka5v9~_`>u# z+_z)x_0 z1xGxM98JPLCUy!tf*vn0ZX};ObuyZFJi7_ec#(5%>!VD`JuF+983BA-Hvi$;QW_NW zDu^1=A)SI7o6;cBnSgm~R?8vWFX%E>UO?rOAUFQ}`UQrdJ-=ZYhlG zrS<1#O#-QVa5Q0tunTh$eTp8(2fJ&?# zELpXdz-pKmNDA~#CzCmAzAK|gL%dW@JxU*K=ht55JTU{sNw1eGXW_ai$V0_6Z1PyQ zT?x2IkH5yTz1IhWZzt7F=N=gwn44KP3nvSXE%_xRtU)pX1)f*w(-b1ir=sZ(m&qHt zmdEtX)MhS>^5i5bQ?;OUa&%YHKKkogN&%`q1_>-cR?<17Aj0KFFm2$3IxBtglKPho z^*gEK-Ww)nCeyT>#Sm^jGl>UvztYV|X%BM=boMKAYY>vy>I6QPPCrjLETgQ- z*yqrC&FaxeBK~BLcKWuEz~Go(DURoJZ@S%6J4@lLJPwVu9jj>3aeNEJ2(|{9!_2<6 z4An_HH$YB4tOwT%c#UoEosdwS>C}l;gkXA|Y`=GU zeu208@NKtbFvm$!Cg{ic3L~vWgoudQsLxI}{>k%$QWnvFrK8Pqe*&wVX1e{?ersk% zW;3yUPCG3K82R*2Z2a_ou?6S*laT^GruRpbe7^p#3mqubrfF3_;w}iMF%nF#(LHf! 
zWQ<`J!%OXhIKGiQF4z~2t(;;lp;A**+_g(qI_8PY1$r6Z$n5B-^h-~eIYk>6zTRoP zn9k0)is*AL)IgYl#`i<$?2RSTzC%lX9T!wnT~cKPBJ#H|6qgV4lyp5>Y;%%T;Y+x> zK13A7SIxjgdR62)dEW^$hXY5z26+Sy&G?d(AF{W@SAsyRN)<`=_@BZG?h`nffYaIL zo&oH=&zU*tQwz{5jDUA+krgTaCqoRLXYD)z+7F}K-Y@{RanH2IuK{$q?|nBfCD&fZ zO;;g6qz6bO-e8iYgnMv++N|=(2sg3{965C% zLJ!BFi9hf5%V+ZQaj;EDYc3@a0vXFz59`nfFz!itx^@CqzuluXFcpX0hE4kXPm-X*@ zOZYcJPe~6#$zg~9|AR_!2mvt+^uiP#hyl?`Vwx8sM-Yf?4 z|GOPaYMy}=*FV=|{|6}bsQI$xuUD35bxf*BUbw@JPXk(=g*W(c!c_?E%f_Z9WjddV zYci?cCua^*tCcnJj6?>1t9HC-5OFAW7|~*mJ6j}AKSg(5e#TeRXz$tcDKAh6&%+oi zYOS%45Yyz>92*|`Ms-$}*2Z(dd!&!&OH`4?c?))pTOHXvwRS%3KVimHL4hmKbR@5M zRMO4t`fzysC*sHL=?hZMqOH`qzYCmJ4|OX8T#U7r##hcllIeMXwq z8BsX#A-5jxx&sl$Czd#MXtgM?ng@(Eo^SH0B?%ESky9jzkOiXz;@_cd3Er;u4 zWN*QyXLi~J{Yq#;hMDStX~Be@jxw;w(r8V$fFn(lCdQl>P|k}71$-gPJ-aolT{O|W>#??AMupLIcVN!9u;a5Ox>p!-?578wDTh~* z4{AJ@(HF>iAJEB!4v|IL)H#EWOAc{5^9z60kQEhT=ib9pcAcOn9jTTm^0yk0#P8*1 zb9!eH8cuVQ)W5yKUjS72ok7irk>!7?gV)1Hk9+*GRh0k4e82hOw`+!CeYJ7Yy6L&b zqva68D(i0}hgp{j)+sSWy$`!IOzhw=t3N56zlT^?#;OCS*#YF|{5{#jw0?|NY>CaG zya0oZy|&P3rRbv!ExBZ}XHo1@m{?LZ?F*x?sT}-Y^Yf$C9RZ}_X+t96ap7d+tr>77 ze7x}vxj^i+nOz#Rq}1e?<>I9On~Y&w)wx1!Sl7V))t!_v3)Pk)$aLfA7#Kc!2-+z> z-mn`~Ey!FCimH)&2+V)jAU=>*Ki|ruQo;@l%+MPbmC6MwPg;kYfQAn9Mz+<{BH4Y> z9>NC``J7oP=1zGDX=-Sv*@g`#$4!g++(Hqm^dnZSvd-kaqBGsSrQ!iTzeatE%Qvv- zvArvLRHZbeGC!U_a;<$x$M3t`_1N=XU9Qje1iISL!U=wGBTuZae3=SW#FPSDxgiQDX z6!NjRC;olYSE`BV{v|xl)#L9X5_I`b_`cc^Wi*wkqABHxac32?&zD+PC%r1!F(8~n zCEXh@z&9o}$i99S&kvD3*Q6OHV(_R&JXSfcKrZwWIrW3wjfNcuWE8`UhGc&0CA{$5 zoFDEm+A%-}0n|-FpaO39t&eN$!SZ;SSX4+SHF9{^mdVqvvl$T(9``~xON>Swb4ORq z#(0fm$Q*SJlZWBD^U)U{3CqIT-;N9Od%~ll(gT5%S{27f3mnhbZm z4V;IZ?U~%AG|qvDNF3@F3Yu|5>5mCZ>XmR$#B$REO1~$`s~_kJx>LoB0OT>@Xp4 z2c9|8UB>?jw+P;r^b@&3RU8{P6u_Z$+hFN+EG@i)eY>&N5c{mMd5)^CGsMsjPPx~<;#nILj8WY2cn>T@3hlB`@63ShUTLm-HB_`&>wuww4OEp2*g<|(q|S> zQUprdbVBt>H|YUGUuxf~ewQTr--Wq zDwM;)r85e;1%~nUF4(b-=S_aB+bjjc#-_~l?>*hpDVl>mbz;;}FV>VU@KsfygRjQli{P`%bN{i(CT)&*pOffzq-vTRF$0))0uSA_& zNJHlO%To)3e>BGBrYVb!Yx+@jU_J`L%_Ie#$^53XePz%6<=5iiO&xK_*~y6_*E9;n zc41QI-fi08Rlaq~0fb1)I*u0`f@-L%;&8XXM}=`Nf$^%|g+g^s3L^(LO2cK=7l4Dc zPer}5P<8;wNHe~<5q~^A;`J1?pUs`JcuO3Yw-F=yU>6FU)tcg>V&Zqm-{U+Bo`j~C zcYLpP9M|a_Q%cn2mL9O}%&m36RTa1>04aAzbi&wXV+U=0jSh{r)V=q!t#CFMUoIXo zfuA4xKqXxK`f`)m-1p`eUU;q#ev?p+eR`6nQC+0V7f*yTWT1-I+>|y%7sz2B{Pzd$ zlr-PIkSrwZp5MUhV7&R#^=l!kwQ9^ef+h_*o53?{{oegD9IB>HhVt~w+tCBx?Kb8i z(pS-=M;ND!@2cIU=NwOc_uW#sP9p|iY85{JD^~qw0mKqZu6d56J|p$UZ=_YupNxXRi*STQ5eYQ6 zo&`u6k^094;|j-0g?1cl=RI!V@DFjrb$mUp?^hU|9HK%hw!D9&KK8GZf-Jtbf#=G@ ziB5*1JTm##5172R5boJaict-{G?DP8Sgf+dgK~txPG|D?aIQ+p(a)1@DbHIdxLkL8 z&+T;XUFXAfO_#6v$q3gKrP!zTuL`&}*S>O+lvS1YlT%!QFIolPW(RQ{^k%=NUEGi* zfNf;QW&UIaV9q{%3BylM>6q*Dn{QgG>|BGu9SjAfq@-y3Hq5jFm1}P`nJGjou^VF& z6~=kG6oF!bGl+|ffw8JZ@1xWETC(%*v6BNrJrN0=1SYk;!ubYG)w&uYGRl_@(G7~k zLnJg<@%Mgzhb(SaoQD{~A7=O!z=<-&Z>_8$o%s8r$1iD=7MVO|C0*4<3@Jp+pESUt z?E)WamE7w}^nW&g+0!{n34@f(=6b=07wP`(*07x_eN* z*sZsdD<~-$(txoF%Xk?tk`k)9=)>d^y)!EBbE^p3&O>mDEONq%QvhaCt5-<@a4_1# z=YtpUedV-V_Q>$wUALda43rNeMI%P+BCivkmg3N5f0%mpRh&l6BgC*U zBimNKg%-k$23=4fnj2|APtbPW^Prt!#?6&nSHI2HZX|B1r;;G>B8JWXXRo_8+TNQp z`O^`RkhV84WC2Gs)GQ^#sV-?pu>lRsH@^(&jOx8#Cs&@tHN;n1_7_ha7AHO7bu6+9 z0kRbw?;HwCY=XDQ(Ktf^Vq~Ah=L_|4=0qgBSptz;o2;b+6<{L(mwH-tXk)xpNV7hW z8;rd{iNQSQChh+ z5L!Qaucx)S7L7FK;h^LPW8ijzN2mZUFP82#r>a#h?B_sQTu!{*$; zIYi#8xO^bM9N}JDw8%lePEE2U%AfLxyU5V&tzI{)xRc)XLZsc#fJUk>=bY_A3!{ft z1A2&1aDY45bg#%+`-)eyc_rqOf74!wP#chCPm|!=Z4T?)jD{316PR0YEw}!azSBGk zN}n^CV2S@n;y|bIW~#~tvt2Xi;na8(Z`|Zpn+1o|*aW z$zw6~J9R`7X+rMY$YJ9po^^F?v4P$w=j(eVBG 
zg?l*;3ztBZr|U5TJ|Hs|c2!e5dGr2HNl8`PXc(f>ba7>Ay%(#Xyw;VzF)xyn&Jp_- z%-a_0B>eXIzO(~EBtbjNcUXv`oi|}cU0VzHENMY>4{beeanXD1Y};0LUDeLOkyjbi zP$-`G0&89CdT5}oAnw4yI1>%g2*rR@VaBChU}A3$6%-QM>wrN@8VZ;^8fZK?PT3o- zLF@lil{A2n1DO;@wBjk>{qYURu%7rRzx1_fcu~?Lh+lzZBxT+}DvwMJxlsVdZ$MP{ zlj4EH*g-{8P3thV2TuY@NxO($BGym!O zB}6p*VYzB24%Ye7;?nwSqJjWZb{e@yR*%r)K0KyTWXeUL;xW>aUFPrloreF1_Y2PM z6B+0=WWPft30Zn`dl)S$oSttc#i@zeY8T&&I8(-^%$;>O%r-pjU#)RoXqBKDl)4X* z_!@jJay<_!qZV`%>VTKVk)3C2k2BMkXfv>vApfMGjY5;1hqJ@Ngj7W3G2`1%uHa+3 zW^CJe$X>z*Mcvfi&(fElf+d)F?(3+}Ct@5x-ZT4?$3FxC47g_wVIlaFm#`Qj6w}E}h@FJRg*+t!?|(?t z3wc0R1brv-4wDBP*P;Pmpyr8Jhm~{&KXAPzkY`1tSoI&O{UpW}k=cG2UB z)I*l1e)lJlRu(M{O1imqW`@hMXVzcv>r3jWPGi|3W|BBPt5)m-RYVB~tM;MKY^|NA zd!JK6-6I}c!)kyYkAwAa2$AUWz*ehc!a=IZ8R>L*IeG)0f$!q*bssEdZoDL+U!r0e zJ?h(Ay=~sU!$Xliw#L$k@T&vYqcNByHRGd?vPO&yh8ygqM@v{>@x7h31#dAQ4rzq- zvqc@*x!PV+9FJ5s=E2jQ^VRxx*_HI=Q|9j`0S12)!T-Y1ubz;xozwoa2Nfu{M8ByVk=gUw6_Efz$Z^f@=9Z3rE7wMOLM&#EV z;abm^1dNx8-xew=Ks0wh9hVF3i>{!aA+|F#eJkBt#-3Dp*d&AgHtNzn&y8VwJ& z5*6ZmqP&IbqIfq$$nNXLcB{2Gyw&7P4?aEk}`Qlyv9vmlyk-7N9yOPzJ& z6wdJGJ4fJv*n*AbmKWu3YkmH#OJU6|X7tGN=cJIgalz9s6Z?|MRbH%-x+>|mZD}kj_dvL)b0KhI{_%h-UI>F z!yNhnzM+;>=sLJSPLi)e%*mKhkia?ZY#fj-6iHT2;HeXBvX2bT;ceuIv$M0L`;b!% zWe~~5ANO*9R$0oyK%MQ8jWSs(^Ln*)1Taei;`A6l%&Q70%#cKd*lIhRf^0+{Zc6WS zhh`z+k#0mE=&X1b%7|?E>aBm6UWD(X&L2PJ~&DmTXHieQMkCSv*ewk%jOTc}CD)PMm5+}avLglXoAfrKf)ka6?? z4D1aMrF2_)7IY`#cH+A)01>5pF{4Pk4}IMBrpOBuaDpPc!XU8BOjAb*gF){t|_|J4^kN0&#d9Y7De55ZVS~h?^4I^j2i_Qq160Lb_bKG}Mc4ALf)3+zTYVSzj7jQb!M11WI)7Pbrtkk-o zn<(%#AeDqsbb#4EAa#Zl^^-5`E6qLQu?0vy}v1`tZ%gS+)XBx%< zaFvfUS8KlG1M#&8q)n}h{IespTQ5#bjxP9Fj74vJ zl3y#1uuqxw5-9J6J~w1-+;Op`mVTIK>Q&<<(q~`=bF#i(#zU|FBDLum2kE659{g$Dl zIz#_R!L-_yvkw62j>WdbDcxIomQ%-D z;*=ff3X>V*FoW*aZW9Sya!1x%$djDnptqX!b!O*l<4VZPF6hnY;_3guHvV_Sk+Zu- zWz>L+Y=<6TW{>m6H}5q5gCxm$i)jgP2~PQ07=UY#>C#WR3$Ei~ghFKC~y=S0h>N7BfLwn5)mO88mH+7`c z%FuIOUw5hyBBvGRdjnL|$i{e5xvb}bc$W=6e-SBG|0TjkrnxbUtGaNkL5iNN=R{d- zOoYmnEUj*o-1Ey0KcP*w3slw+vRB^jQ^g0Wsh;F@iKlzHY^*xY%ip1NS6@%8uFh{X zN@aQaN~v4TC$GHCuL3{vS=^C4Qz~T{;*ME`8mI77f(vt$Y(QlPSa{fG_36o zITLtJ^%INx^R~?p`Ve(&8)BB_c^(P*LGE{_tC$nl$3Od+8%888-BF0SMpn6R7WP_1 ztt;(xfBpdJRMXyUkqH$siLfVw+8+%omD2A}uR5D-BzTWm$@QiVXu=7-gf%s>hR;-k zHzrjQ9<1p7V)msti-b!gNl*L-Ez}%aPikSeGB&=4TiNEc@kkUhTI%KlFH5R=C1!Gi zc6+UYZN)Dyc0?VNblmw|oE<5x8|956liU6oA8vMCC>{lzdgT>Ev z?`qeR10_k=v>R7XpFe(3`^%K|kZX7^?d}-CGCiP-^ECa&fa@>TRY|fhOc*;%F0DRB zpHETtD{GRAwob~$bALlCN|vCC<@8X&ZzZxx9|@dSST8PA;o&@0m#z>qi&U3nGf{lE9v(f6xMB3iQ_D!(=Ruz!} zj1~9^w%+gfwBz=6`Ze_WOtxk;m1VXus$64dT2&cSg&PS&z#!2EsEAgZy2niD-r}#N z+~|2sCe8N&ZRsK?ph&45=l6eB#HRDj*ZBJ< zgr4;@RNy}FNZ>p2f==cXLokQ*&#iGHBO_HlLm|Lz)EX!Kw)Le#?p);k*@9OW)hRx! 
zeb>MB0th?SW8~SgG#1GYk(pUAzo&L^#{I_NQ{;Lv&5tK?}`a*v% zZ~tW(-;p!4DwfNjCLd;cSM%^6Z~Gq~N|FyKp|ZovIGWg*SQjh-pZa>&= zK%GfJt(o7Uq+iMZ-?sd>uO<25q={a{&(eSEr2U}0=Z^Zx>wk-`1`WVTFQZ+Te$AW$ zlDCsy1)X4I`i*}0nqr)Nc z)spfCiGd&BzuxdW%JI9f8`MAbk5oE0&i8dbteX9!r|IZ~t(!(5tyA%)-^+=27m|2` zRBU#9dq5yBtr$MDrn`}F_9}Yaf8d$a(chl?uRkR{ zhH(~KT@@PjaBlmf1O4;Al|(>0Ipg%f=x^=hYsUL6FBRUKVv$``ciQV}uN4eoX2uCr zRy%^aBY~{lQz2VB1Bw(k78N(0hgYzaYHag3N0CFj=4pa!u|wFRnBV;i4xcmBOpsf=Q_hcxOCPy8 zeRf`>vFA?f;IG5SOj@>tA;Sz%!%8IUpK~BRf$A6 zU3RtFx;Ipy%b|UyeEF^|q087fojwx$?5yvWg~y>m^aW2?D!}y<8Psk!2-h zTY+3ole8X9!~9a~=6hbo?YZ*cYiN%hvA?r79bK|~SGr@cc(HJ?I5k(x&X(AJF4y&E zPjJzrp)OAc;sE%XV`kSFS*Act1ta+NCd7xuC;q{-kmtaMK{shzcIv-p5!S%x7DA zsMiimKa8850=LAPqEgp$M_aY0^3-+Od`30*{QktxG}H^ zyE{k`%80$tv>r)_zurr<8sVCC9-GZPhh4i zjB|}BRwHI+Q_%+1%JUO)#Qo~lPY^PB`gXk-qw=UekI5bf9dBu^>bIRnqp_jcJu#(- zZ^Mhho%7s`7+1TtH!Vos?H5j4$~6e1enkW}BHJk+lUEkcs`U~Fq_>{QF@f*t+&Kav z#>Fcw%B9+#P!Z3ZpufA$!Eca@9cQ=U;zd=)L%bbYrCP-iAAzHy#< z?Kw)wQ4yXOp_czA_d1$kwd;H&)8izp=k8w;$NYMkK7V#!?sXtd*{+-=EYEsxNi->~ zLzms6$u2hE;T|E>!8v)!MxYa6pp==@$-EN)o4=m5=kqPMU?~nC7l*35x$! z{{J5_-ayJ`X1V0AwD9ME;Wu0aeX^wANnP2TS4>^)m<9Xn`(h zX_LQ&Xe15;yF){WV?^b@fs)@K8Gj_ehV@mrIDx-~4PzmrFO48t9IO9djPzr#ona**2!LsFBG8s^t!u%iU=SNd`B0z=AKf7N?|P z_B{QRPqO0pYs~)rvXmx)ZD^^f>Hpp8zW@DCQ1y~2*Z@qd9(=U%G+(DNh=P`<0{j|$%dDaL3#DiO{mu|m*ZkFyb)fVrL4Pq6l60vBg zm`X(9fIFkO4>_TK-^Hc!V!-~&Y=itNkW5f9(HvD>XwkSlNCtuBInbeJ4){17N3!*E zoj5s56^puu$~{5atcX5nc#;KEJ{GX|+Su?49z`;9|3ok8G=ABn734NJ9pUeEzfWB! zuNIGhY``1V*47&Xnu$9=FP)BK;I6Qx%{m{-a^>8dgQUZ-RQauXMoEu_@!pDqOThDU z<5>gmrW3ckXSZq9{Zlk*ca6v5hK8KVHorK!Krt@jU}>|w=Tb)0((FKSJQ8>YpN1(t zYxn_fi04jF@ENr4FL<->nAkgfodw;&cUO{@%crDWf~|frbB?obr%sLIH>vn|*g_8f zXHD*5)O--U|1{JNYei^tL1^yN+kjLT#*af4j>#rv0Qi|!?y_!+bKG130%QfWR&rki z?LNkYoRKTl_!F4A4?-ds%3Sre!|Mm?{C4ab#&5LZD_&g;i0lnGFhev!Qy!8RfpOe< zn7>o%$15S_a369+SARy)y-yclv|GDr=Z~ZOctTV`7YpY^?}aPVi_uG^4iilgbESP| z&6~G6cF-{jYNyO!-YyQF=2VC(tKf~tn})NklK9{q&-NhC+*@ZDhc0;6i1Au+u)SO5 zLWt!3*xvU^1h&S&VlTIV(wYmxV^N8|irhanGTi3ZE^)j(MVW8pt_H0>>QUQxX2^uT z#wQog-A+G6cQE64Z8+wzx#2Gck`CR{MBoFhod>vPk!#KHDv;ULh>A~6QdDI4ve>Q| z=1&ceAW34^G&U>?f?a3(b=GgCJ&*SB5Jk~8R{Jw`(;GIW`^)T2rVmnidXP_EVGnwe z&@02h=M{&{%-U<%qlWg$cEjGWFWd()xOR%;=P=YpGNl|vDZf?#UlfNJd!1K1eEzZ# zj{@sn>3$sU#}lgB#FfAKqAf=JOas2k#jbVJZqh5l zhSTpB1B;~`WB%kaafr?T0!sq#(_%m`ysg)gUk|u1sT$GRUn>VWVVK3jlpN^;#r991 z-t0qBqq^K4GH!GQA6%c)9AgHKVB?E`yL&74LnI31ncpxS=$yt~js}Hxt3kN>AXZsQ z>YD_4FEwf>{7Xg+APEOhXW=**rIV~u&%Gp0k4SxeYE;ZVHEp6N>b;Eq8}HLmLU32$ z=y=R+qF7?uWiq3lpb;xkKeV4q?x@fisxAUQGm?Kz7UX&P-KU!+!+&zT+|{#>TS1Pd z{_ak+de^fw&?+!{Zc!Zt^RXgoAtu@GIl<5OPToiZ?H{wDWvI1w0_b#Mm?!992zY;6yVtz73- z@zd*LUY6!7sF!V$n_kbjSPYUbk3F@T=?qIF_1raGchJUz%nYR@7hr(c#oL|##Z~-H zISF+3a&UX|fIfBEK8Be-ErXwufcfb|R4wRKAw*2yyZ1DbuU3&dtu=Q5wd&u+DtQ|8 zs0)tLO}qLeYF+ibAmfqil^{W_=K7b!K+C9k?rfhkK#n^itz7 z(Ax5yK@$Jiv!W`(fi^LLaLnavI$sidvxjRuM9mfsz!=6)f}MTGN}DZiA5Eu`YBz|G z4DyC#kWnrE-80_BH)(ow2l>wcqJ78w;H*(moo%lmBk<`a0Ii$rrtz8StJaNXv{93c z_>im`HCSpcx2L2SF5Dj_Se8tjLhVit7G5FY-I_R(Z*?8$$GFadc0{8A>)E!4huTR2 z*Zc|1380;x3}Gr-)Os|>Hxk&7n4W6*FdGdxxv$dCl8IQFealgBBHH3Ed=QyJi~2BF zYBxe5VM$2J5S~x)k&mF)T;&uq!2{F<59|T#&}p5VxVbU)P%|n7eGr)pEXjQ{ zK(ZQb8uIumxx$p$0*Vf$o&aFrdeXbG4-faSj<%bu&(^~oBt3UIa==;-@q;#dV_}L7 z(W5KGA=ZmCo8F%`7h_Id)nGWTov-wvfwy@seOlChGRQL*Lhq2UcE62n?&U@gabvip z?$u5&e#l2r3HM)Ra)PJyOh5nO#vgo@nqWITz)~f(1Qw=V~L|6`B0%lD`N5DHoqgR{g56my!DV>-t)d~TAU(n zJ`B`FI6Jf_$O8xdDmmGUp`_ZEo|7Pl^CSpJLV72h<$g{ka%FBry%t^#zrC(_9LOg2 zgHg|3d_|-mqiX?k;8_Ay!?^0npbhVjX0G?;DDOb4y|GKw<(fe;^|IEP62@7>*ulO* z#+70$4`cdzspnUDg9h>jLZz>`B-y9#a?B3=%nOeEg718WYUQB&eNc9gp84uJ&SNS* 
z!QUM;ztsS+bmmo#1eg6_$tj!;+g;jpBJETHZlxiF-C?%BQ2y{2(9v`K1OT^hS%DUN z=|Wj1N>y0lYCPscmt{|xEOWGi89wI&pc`f`#WfHK8d?p$bx0>bV75bw29fmGeq_^5-OSJV2leJgs-ybr2fM^hoz@TNv6USXOXy`aK{R_(^hrp4 zArP;8rwK%jw(D!q?->@t2%ttMt1`FGl1>YGp;@RJ=za;cM$m+~x%L2@2=kDU$zjk84VgEv-z{-93W-e1mdB-;|Miyg5w1^oy%;!r@Xa|5Za8gW1U zYBV!~anl^rNrRN(B#>Yn$E2NZ2d9&!Hjacd$6=|V8I~o#*>xeB+NB>+z#30nI7Qyn_Aj){qT^3 z3n&xGEJ>uPT}EBGJ+P_9UbD;9I~R!EL<8{yK-cg>x`xfMb1#fx=^{{u&lD`EQz%+` zmk`1lbZPC)GaWi0yP8ik~%ym8|G#>aG&n2Vp~-kfp-d2XuwEd;eyAZn0^DFHf$3h9gz(4R0p^IX6_ z_DP)2$_TB%5d?%%jzdZ{Evp*Hq~g0I<2&Bheps|&lw{T8`MP)nmRd#`!cvFo+M6QI zSO`!Z)Q9dy1}ZQY$-)@Ku0R^QVm7P~yEYm790zpjSR)8i3+9e(hcVpV(!_1#k8k`% zifH$`&lLb?zr2aNF_0rgu#0Afl!eVP#9B5UGo?5o@{2{h_nzPGLwr(}dL8gRr~5Vi zP>gN5zQZO6v|dwQI@U@*2{%w|JvITjVFcio_|O&6pEo`oK-~j9X#+Pqwc%}_r7ZVe zFH@j4D!D-I9@~b2ITI*#T)PfrDFspHLFevt--~2rO4i-sib9rq8E_}>_33nnO&T@T zl>pfX^&t@2xXlN_2vt4p;)V~F&5;pl;w$BgfKQ580iSe(+ORSNo@)m|G*Q8AI!tC( zHE|$P2%bQtPf`XDWH+SJqXv#Sb18Tq-Bg?ha?Ak~eHQxDq-G-w%uYpr0k-x66By~b zrUB|+)?S5UVY6Yknu(3T#0Cck(J=3+TCGjDE>aWO0(CWCvHuuoS1a)uI?dg{KDo#! z*h_jMmP&fYs~U%l%KeFZUH*B)4+2~Ga$Ek--c(zl7`JdSeRw4)3cF#s@u>zN!2Qd* zlW06`x%^3{8q3onPx6eENg`}@>0xka4-%;pdGv7Xe#0=l#}ua|kdN+7QW^uXpXNf=g5{AOQazdCNZI8jbLi+2ezS4Y`;J!0hTBeJq_)Sq2FLbt)DA#tb%e|`jja4alXm3nyG9Y~o-NXl4x8GI=$vI?hc-Y%BQ z1Ty5<7CBWMR0Y6Ef6El##l*?UnQDtHK9|M`%n4vhI$qw?>e~FE5^_wB^C9T{|1=Ze z*=VP(qkqSV1XRLq;Z%f$1INQZahLqDM@1zo<;iTna;q|+e?Bc5|e@6o~;3{Gg; zhal5jxFra)7)({Vk=}_J58Kf_Po{F$Q}p2Nwj+|co)}kD$-0MRH45*xX`YVDpJ>~3 z^BP?R{u9Odnhz&$ui9M?9s3WyvOjSBz0(Fzb8n&TuYLFb8=u=YKNTp`;-ATW-Ozuw z@hfilMtuJe$iMN?KLqj*f&3~h{*5-azWzfX{}9OkB?3v}eOqSp4Hd+k{Q^w4lXo^g z`X3mb!fsMYhqioy{{_Yx*nM{ufk^2*%0|0(GxgdQ#NSLr%WjX#%{WBIO)A`7w0>z> z@BCAZgE#KoRAsq-@O+8mjv$BXb;4X`W+oynDkfnDe40s!iAJPlVcMC=&S%_|>6lJQ zoLAC+=7VSYREQ`-S8h7P$K|6XNv%?qH*h{vtoPsvCp6<8!>#HnB^MJN84ey-^4+!_ zM)#jS1RH>M0`Bq=Dt~$9x6$}`Kzl8o9L5uWJ@_xTj2#Afyk=UdO#j96Y~AAbgn?l- zmLdDX?r)R+rzJqw)VkBtZG`mc+WyDIem|Ny4-7`*`#DSdKkv_Xw+NeU-;Nq1UqAL2 zTl(8AFCPP>`J~`<_+OpEuebbj9RD20uf*^tAO7Kve}vUvilBdl)jzu9FBSAZs{Akc z=-VFtqssq(kF3n!KRM}hQyKkEaWio)$05hY z&(W|W#=~n22R|PMAf6aG+QhO{znAK z;}-h;{JgB{jm95gbY}8?WnmZd6KX^c9q%~Pu{@vgo^l55+PnMU(R`($#$=n#Gx;^= z%&r5QLpU7z!>bOCOTM+2z^)y-smH$^wvR6O;=Y30CzQ-*VRoU_bteE9=&GDbh#d_XyG+O}H~REpv@Ry)g*E57UWk* z7M3YwyBF?->!5AA(S(^}{o!dS#Vf+?{qqyH14Wk88Ofn)cKPV#oEkKHg6B-;>3f0? zcCDu}uvE-sWq0nA9Da&h!K|c8qNg>Ai>R~}tQH^(A(`#&Ao{N%OR90!!y9si3t~4G zvS`!=`{GuKX(|bYvhGU`Q;2C(pv*3fztGYjdW{BcZ9HUV3*L6xnJva>Vk(dD@Gl07 zwp~7;xN+|=2OD|Fb;v$@EIG$B(6zpDp0*M(j;7LtoZ`+d zEk#ptp1g7|657p^V1hU9&fsg5d_P1Q*9e88iCrh6Zag2A;X0M~%rSTPR*=B@VSD!N zFpDF?`!)OOQ#QxWwuq1*hZvrm3cPZltWr_g`b@x zap!Ul9T$%Co##&#LajB&<3l_ThbhulOFlhfUv?-1tZ|e+_+r;frZb1Ihep7v^;((* zt7o4tgZyO?wKIXxqu9+>t{^uv6Q+~7rHe4eHQw#=D=E%x(kGRV4}#OAC z?H3wZi8r1xTn;|a!K2mv0~fE708VZYM0I7!O>17td$QrRG)rX@WS@a58lWx{@nMierR!VAAhErgs%xiUe*dk1RXZ&fODOr4G)01FF?+O+6*C9+PFLlG$0MTH( z^7^@4f~G`54+%d@m191xWWE72F!Qj)U%)Sym6Jzs)rId3H_;@Yd_om--%^F3O2>v( ztbMP^{XWkdXVk(F&&vrvf;Y@?puiraEVjw!FDm=;^H{A_lY32cZ#0iJ?`GZ(Yo%ML zxXp2zUDTqXdsX%XhJ^*55Qu#yfpZ^QYLLVfvegXJF|;rXbEVD%9ZSXfi+i9fkCg2L ze>($`k`=h>@ptBFw@8$C2;^cccoC62>Cml{en)_~iwRLBN2%4oS=>;8?N@Gj2eyZ+ z@jcIox*5IAk8S&yPVqcHPAKq-wWlI`8=-ox%FY-ZiM6i)5C+<{Qr&ks7&|Jm#}#Q~ zifJI{Fn_&97%3FEQdEvdQv7h|4jva)@GETj!urOU>9fJ}QIqIry>Jwj1%K}Gfij+V z#~;v-?#Kf#C9KiCYKwcRCup@Vn1!3aL90cD6?I=Z$;4Y!e%FV3p$mOxUiJ}^Ryf(! 
zkN(*Qx{>HCSj&i zGgV;w4H_TW&IY}aOB5=rujHv9z2gnb*-1{MEv4%fnBU;HSU&j#(meX|etZ@_Eurk# z3e^tENrm|*Gp+us+ERIM=T+V&DJ^XAxmM}>`L&go6@1)YvxRyE@Qp%dY9Gniz?j z4|Pj%-xSB$1)ImZa~IJR7R+^|84e!RPa=-#{j0-e zlJ+GZRM4>yzpHE?>kyj^LoOw^jTDVKzj-VDiM+0h(nFO^bW$}d7J`9J&z(bFPWRC98Kcw7rN$c0V)u9>#zDt*YAD!?6sgs zEZp1wHh3p$cG*KX)gwb>EJ#S`JgE~e^b|~%9?^)hRur3|thSBkz zh7LcJ9ljS$TPF}=&mKBn;0wh4v{C5;IizG3N+K?5%o+4r^n1TcGI;u1FM!=g1Fls~ zr^U)Ra&yzZ1Y~3TMiJs;?c-){YJ9>`wT5}?OxuStAC3K$mrC zLUh*!-0+a7a6;u_DXn&L<}m4C&=m3O;#H3m`ny$*tY@norF<-}UCDxKR30 z;jv*yLcC|wn*a}2^g5=71Lgh5Hf8-R@+OO4{^UHzF4i{5fC+`BypY`>~dlLj>AQDumokXk9@)Ciq8PIcc}bAXpr=M!$V%o=QtI%Q0(a4zD# z{7*~(*?%4S$pu?+M$dcl+U9UFE&95AJ^%v^LgsQY(Vv90TFxOYPcR=BPV{xOvAU4) znET25%&!=%kZz-X7j<6a!TD3Wit4$Q$R@qLyB?fO^e#G`b_Mr#Kd60lYC6@^;z^bn zudR_QHl1yr=xyg7a=vH2-#Ex##18LOnm`tQz%;Yn*+bguc;MxQiV^c+&-4VI$HF&2zVR-b@%Q`5 z&h^Wy7Nt0h4I%|s`}1%wM-rRIf>`dFD|?}U)%9Q&1xLK=HxUS6=B_8Idz?PHJ7^&N zUm;B^0MAzCozVzIJeQo|?whFyg_08WB4Ria3B#1?s|jy}8C$S>#+N;RR5Gst>=AIG zm{$JA-32(%l@380seyyrj-D0*>JJd73SCPuHuLc)J{wQFdh>FT+6_NJ#RV z)U)y~Sf5;*)A2}Dh0z^RsdW#RDZW!rnLi-Eq+JDse0FvDM0*J9Wc0lwFd_i=so^9m zwnLgD?qea-KkpIB;R?A(B(fUQW_Irl?PRwYSqLy>x_)HGd78@PF}oa{5tt-wcEKJV zJ071MC?ac?;;D2~uzbazq9HMmqI;EY_od^e!;=zt@Cw-6!^QS>xr|z}#1hSWW>ZN& z9N`iE0k#)WIT3lkN6$d-#3ggnJd3;TAUE2KUcD~%hR1SBe}exG)OuH9!<`78&q59( zULTuG&P)=~VL(zj7$9olaf()VT2Ae{+HHL?*hjh`nPxv&q3lRUsdt7_tM%Iv4hG%u zEx@m*n{+>?{mtlyKpf%14N2Y!g8SJ<@{h=Yu%S(rI}iuxL;>VfUZv9LQaoMh6z|NQ z1=FV;dPAG{Qtv*wtE_Lugipy_4myyA4^aLMAcol97d(y@ z%L8~T+}Z{Y6sYw%Lmnmb*Wf2x+#bP-B;&`j7nYDruCKLxf8KrYGsDp^E1X5x6~yS` zOx$SmEceC0y8|@MsKIm`OEt<~oIp};r!fT8whvOw5UGZjf z+(BZH+~>gi#%|QuqZUA^Xa?? zZ~K1PS#4^aQwoF;*eyxGe3Q?pEkzI(Z-F( z$u%e!uCRruzGULO^6JZ^6AHYc?XK-|i!Do;T0Wa?BC)j<~Qjl zy-=1n?MCm?F_ah`$6X9P5V;UwE%L&9c;fbmOP{@HP}!Ec2DNTTm$H{fdFGy9KTu&* zdq?UpAc9ULg=j$EUpGQrxGin~_i?G(SZLPu&f`pUTfvR&16Chh!wKyJtB`dmd&t1l zi&)wt&@iUXtR+tlH=;Gr=W)Px&_(X=S6m)(^ujVo1Su83X$w)OWKi-GCiM+vd>+nh z^9}9O>4|0$+?Mr{3WQT_=uH~M09f7xU}6Y?XEcpJE4?lwacP@L%IXdWX2@z%OQZ}> zb%M&QM4p2posd1caM!Enk;3LB_R+G#yEyJCo#YpKu+5pxnUmWuL}Z6~ituCKx9)Gx zB44*$o_|^1V{Me9`pSUKfc7FRs0Wa8b(WXA&vJ3122||P1?;o5y&wqu*t8|ky0hI?Z$1E`*@#~R8?-Dl9-RRv&covrP&_~%whJNVK6DH&cJqe=K}cMAz=KC zzEbnJ7F9C`@WFeA5*LoZE2T~{0Guq|>brd%as*gc(*wA6rr_gtlp|#TGApXk`d8=K zo?!#Hk?!dc;MEs|vy$&)4+OH!F64Sc2FfzkSK&d&=#gR*he}Iv3cTe}ISE72SMJ`i zf3aMJfC!yh3|L+;dI2`MRZP zEmYO|?x4%_#Qquq2cbA?*A4c?oEQn{<%s zCS7_q+NKp{=4mVbq)wUDwEIoi6My3}0%k;F>XKv@DXS*~gV%%y$x zLWC}D<7}_RGWo6rN7Dta-B-=b-|V2hz9Qm0d^3PE5?ESV*Ztf)))$Y`&G=vHs&N~3 zRILKelryDnn8kS8E)B`tNDB3k-y>B{m+La0emc~7ei!HU5uC(A57_CXa6(^xT-wmB zk3pFcV<&g1UScD|8+M{^l0lwAnq6L{mHxJoW+Ciwre*^D35g}-8N~;j>be+l6;h7h zKng}NQSq*Gpl>b5B_pi86OQz3OTzp3ts9!Hbel)z z`+0VUikmKmK&Q*|i|nGGTmT0mYzA%YSPv8rPX?+ojSM+;0?RFcBsYQJr?;aNPB9lB zeWGN6Qxi1Yu5V*4nba={0Q=4Wwy%lgS5HwX$9dY3b-812!2&=D3zDO|2_RBm$YZw} zyasvD$ZU@C9en)@p5C~#fJ>yINy)O8aG(a=v_o1yg|>FXU?=dI2Ah7=+r64y&V$^( zM$Si>+7H^7kIYnC3A9H^(mpqpJN0`Ka;wvX67;m&A3Z*Y0dhDi7rfsGZjIP){CQt( zv*@PFISIqZMjN9;X7exn4=sLLJJ7fx5L|;^5uy3+Ee2kpbUT9tHermd{(;r%j}P`_ zd8Ia1DlWY`;PN`VdVhk&iCyRGO337Jjax?=E>o@6gNXBS6UU!=Zx1E{cL*Irjtnf- z!lzqm*W{F)=G~y$3OQNx;cMoY?(tPV4@!LT(&|fF$!j-?XLLiR@cTw%jInKG*(iIk zHolH!M;kGDRbc;WZXf%w#47v92W1)8{;-Il#1!oH2hi}`42|*xVATc9vS2?mN-9~W zB$`I&0B43i1WQFKFAcxXpe^>6V@fS*fNMgpyl?hy0pO~-uf(n}zjklq(l#eerl%V% zqV3nS*U)n{?Y!GzV~1@hQY?wp6WF&i+fiZi%Z4Yv^h=51`rpfMXOYYgcoMqstja&7lom5=ABL28;*8&N(Ng)KZS8fbDx2F>Fss(*WUij>V zx^2`)k~ObhpknRIG9xXls#Li=Qj9~B9=$Y0^s^9K&hK!X%ea^V<-*6G7si`OmNm|* ziD-E5mw9aOu@jxqEKoko9Ghhx1RSKr#(CvGU?LJ1Ye0RJraw%C=c&-n@wm}m?6cR@ 
zSoxL2&^I!E`(}~vty};NBRVu~OD%5pZc)tb>pa)^EL+O!nmd4!%|n)gVz>;W1zvk6 zqsFRpdY5=okIq?;_hARM$k>@63VR}SF&$?-)a%=p2q^T@mNm?TR>yGn`PUQFf*V&V zXE!+1WJi|!6nc=K=XYg-V%c9mwEp~No&cg9*$XfTCphQis8>3$Z@lml9A_J-@($Ja zlx)6?pULTvPteg^9Hh7*4C<)j(J<%pI;hXctx1y(n=a&zG+kY&zKyP|3=%*JhX~RE z;`LP5T^Bz0wd9n3&e!WwpIQ)7HjR6HjoE$jl-f)$%fPof`iXDlXkXxoTMS9t4-708 zw&!7${cd}L<;NQQ^N}R|6+6cgZ)AD|UwrU+g>@Wpi$?c>@u$k}8|E(8xae+kYU|(L z|LRl1iVc|t^@O*^T%P{ACWfFCd{eS4MVbZAd7{w%I4Qa$qua;D zN)1Gb;B$~Hx4i7(_WElhGc(SPf2giow*??Cf09A9zAU?L{_w`&zPx4>aX#toG;bmb zC@bw#1qt0RFE{$sw!y_tg(JvaJY@E)_zliPa`Jie&n3R78zL1P!+mVDoQ;?N{WP^4 zT*q509Y14J={zG&>DD`eB&dCqB(hrQr7nR*a+$UFKCG1Jk$Oi<gEKDz6!d*HL6I%%*lyk;fWXY zMqb#;xvU{|YGcVFFG*)Qc3JK89ww$iTn+#Xe?}V6W!ZAc2 z-cntGEbpUB7ow;o?&sW zSlikh2+P+{A0@xIPUPLJ@TB)nS9KrD#5m4`k45hdpEz`Z zWzQwIbT9(VKcBcTjOpANYS{|U!zW63rY>?fY*X*#?OX+!wTVzysNC`He;9H4ko>?Y zVSItvG9eqG`wkUaz$d=gZyUQIJ0)wl{n%R7l0{-l zW|-5`cWLSK9$=t>p{M#ilPyuYML6C?!$p{KQNXd~u6uOb?&y)C2h%4Sv;BK29iCsW z3)uF=B^TcXZG+XNTOV$ZGS7Pi9vQ&HbxRsDjnJo@vzw_VUp1RhjIgOt6TMd#r_SDk z>6i5kb`5+8d-*Q5zR0Bwgo(S4_!e6-yJ7j5j7r{FllS^PCSCrFiZgWoRvLfhXG(HX z-8(rGKb#(TDc?NWr_LuP8?jtSyr@x07I7}$rE-IzsOd&}sO_#zQc@PEy#;jzA5YzqHC;OA-Fhl4iqOMw_0NZee z-4pv^#e>7@ljWIgkGM_uPDH=B>SrGWEfO`@TeeRs8m#D!(B!L2^N!I@tE+?d1FlXp zC^NsWHg;k<;i_JS|G6Z0=MVaYNi;eBYaUw(sR4wZa?ox*>2{V!otb{Fk%V(T@zL@M zzbv6zpo6ntWfRV>4yN~>IjMTrKdW%zJiMQoxF zu@ItaxD}SR4?UWil{U|u>kv>bvF-OeyVnpO@UyNl?b5X$K{BB) z!AtERtm|@&YLd^VN@;TU-hN>L92X=$xI-43=>6KXjgY67<58&5mVO>KUjYjEgufT) zQI|%%^$o9+#8NLYa8P;ufWpN(qmqJ-oDayKzEVP>g+1nr+Fh3BrP4aFkupIx@sTk*ZdgN>tUhss+V?Yef%j~t zrJ|)3Y}l86wQg$LKz97|FRBJ|(yet&_VgY@TGVIf+2%`^c$wfuZG8nnK3}gG??L*l z+NP_I_~4T_k)|{+1EZrv1~;ALKl1b)tnX>4#!hH(&tBzHl}gHi27hot%rnd1S$dwR*nvOu@xPw19^AXQkCv7hA@DQ>*V@1<{Is3A620>f-o# zK1wA6!l$A*KP@x{=Y>Yb_!WqG39x+I4j&mOFL%k7~*2muIGnNFAgA|Nj;)yO) zPZP_}BUjbUZxDF83muls=w!1JOQaI!V`E#M{(Ue3xWIm<#5Tek7;{V7fwOg$@_LKl zx1l0jS4LnxWcRG>V>sqG2JSPxS;|(t(W`&$0H0Mfyv#F<@Qhn}kCv2TTi1u{aO7El zBsmKm9(o(3r7t;(QHGhHVu;Rqq7D_y$v;|}>f5NkF@To4uriuCn|a>5y(Ro)+(vXQ zt=0iMsHy1FCBf&XFl}*S@>?r<98?p}f^1$jHhrpIi02qD&9jZXpk;RB~GT09XhC!ZEAvXq-1I|+{>ZHWMQ(D5_7iZcy&3a3yN#r=}*$ysX!tS)u z7tRl8mM{@(O4zN!}^n*eu&T5BeDO!|K_YkO0c3KyRQcxwt)z9VB9S zP|>U2mLDUEaBC^|P@-yVT-`z*)E#{x)tx~_tv{QboDTlPfXlN=^4jpR!1A!xyR{U> z3LUw$6RTp|k=B?e+1hzgu~`-&n;dFQ7IulDsF=hIq`Yz>xI^#TB09`yu?3js!WBpxC!vc#&Ey?J$Ojgiu>q9)5~8~*QnT)LJ&QW6;8qZ=~d6a z3y5SDYP+$~Cni>FHj|Ole1R>(C|IAR_q;pDLgGM12-@2%yaAgc=GPOqdWeV<34VOswP0xcwz6hnr+D$R~e z`}EKbA^L_J?z$fn2QvN7REn?Vi+-OJTgXw^b`9dNT$gM1ZJD_@5V1I*G5G;)PBX9_ z(i&?Od%5)#rnrIg(78%e8!kcBFOF5)#8WP02Ivbp3R`6Gz}xN-hFFIgs63@_sn8=E zmWPXmviEl3h$64P4;htgS)i>FfI(2lZIdMu}W~2 z_PL|lu&LqxF@*zx>W!^$*&5!1%m)cgK&I#~P%9}GxK~8Gl3;9P74F?y%(E4kXfIj&c{28xDr~;bbPHvrV+ITEz)mpX z5z~K^(qDKGVFDx?PvQpowct&wOoaFQSPM#PtxEW%{TUFcXchDL14X>;YY)8*Y7@m= zZj6G)DfH@<(_edgmCQY{B{s2{HeLbQgAvfoJbz80LX)&%VGyt39%sQDvh(b7nvXhA zv|)acUuuwyyp;DdiK)6lBain}RUI1_UC;UKKk*90B88Rdy`JZ9Zhv6~0DuX@7G%JC zvO)!1YVmc08YflrbLIde%{j(xe=PfEFV$Id)f$x zk2{oXmQ~?);Wi>$a{bM3ti7Fh`&Bpy15~5UAAPFxtq#^3Tcu+%oc1PQZ&+n`mn~kW-5R+gr!vp ztyslgFqxcCcLC3K0@ba{OW^O8zm8#sEQ#}NNpodFxcxWPqUTJ(*r8w>5A{>UX*UYm zRbBzDE2TC!v{2->E6J}jgPdu#I(h-f=6it~j*%@4I*FDW*wrW;rDckFLw!E;P8^Zv z%(ZHd&rUVvvna7XxG9(zuyz+|nJzUK&t=(LR!+1^r;7==VuiH}Ak7Lk;HM{w^mcr^IaV2hdzB#i&iP z|B{XZ<$nN#pRC)ZHpcs!s$CMq5()R!eRz0|Am*yNVEFoJj~cdxGTDSsI?V2T#{HT@GA>9cI}ER z{YCBXm4$WPRAtd=O3t@wV)uB|wygJLPVZ+eLg4^}v`aU%SFkuzp?G>FtF~G>v??n` z?X!JG2t#RfH5!Z6Dje+$&^4`RFl^1|fHiwl)Nr;9ZWT79X>;@9%dZcRcC!_6tK!#0 
z=n^hedU30wW_fvTq-%Fornf|7w$y*^iPTDU4bI)Dq?US>omq=ss09bk6fGSYr=3mD($qZ%hI#QfpatJ8MhLg0|&1?g}h}SpIs>MAU^FrW-ey&d+_b{nIG8E*eI4T^C;i*Z|#2qiP;a zGtOw75xV=*O|>DkKOztM=$*|AHqA=?(9mEZrbua5SESdg!DQvvID1Qx@deiKjF7Yn zvX8I(r6>K4qTrQi@%IGc6^ zQm|&k#>XT%>B3}9Nvg)>=}$O3N&(elr$`}ul^w9a+01NsnM+z zWMjQz<@!34!>3Ux7LHq35wUwJp12X5xV>B}KC$fM4Q1ruKVsI6&BQsA?z7AgGd^*`1UA5<$kD2`!FrYd8eVNZ?R{W*) zSz5JIt-DuQjRiQXuTXlKM|&s{4S;97V4NqVaR{maW&GBa_Obt2?@L-h{_?j#?v+VY(s!NBs}tT5MQ*Xt*+f z_Iahn{cq5g>NgidosJ9m_9Waj_5g;MKZv^x!8O?`$fm=UI8>dd)~3^ z`sD^7(&%t;6^V#t&3Cl1d-KH{0JY zZd*P-O4z_`nlprL^LuXAaU(4}HCiVOt#9kGZXf-P#Fp5leGuo2eK^ zYZOvmegFO<7p?YDHWvC-fO4#FD&+NMSIhF60fTM~xa5mj0T17Yj#Ogd0IU0yXf?{| zmqfhGFd`KzFTDH%uOM;q+#%T`ulL>yTKgY!KyfAohof(_Z%TL@q(1en+8Xh4Hy%^0 z`xGk_|HJ8!>F!8(NdHma=ZV`g^kT~ z-z6B%%>+?yI+j2iK}=)z1yP*pkKbKm;|LJfGqa}l0!yIFwO)QCc*MK6|FA)fdn68z zYe%bXbenpp9Zi>B*>n|Q1H1bB&9+m+2cX+EbrYK3AKF?y)!j}A5smX!Nqm-eoP4Ro zC_$RLdOfU0r4BsOt{cND>=!syiQZ-35OUE263IOi5z>kIi+KrVzZ@FnCmi|4!eoNP(?lAxrm1hBJUHaeANZL77x z(5!sEMnw01G&;JVSAtE<1I^qPJXTZEpw)JaNpEkjVAVD3SbiWyf3eleC`mN#eaI6W z?ZwrhR-!WuV{d^T#)HfXM~mvPrY-%6$gCMxEAzXKN+nFVGTbO>7r_8)ZoQ!0t~7to z()ebjvyt%>Yh%KP=tFT$`IsC|z6BPWBu|`L@;={+qs8d7X0lR6-Wa32kg+Ie5nMW~^2VvMs$K|V<2K4cbx z%|!9JMTUb{{lm#uI?ctue|9n}Od3qyX)GAw_rFL5HVJML%vp9jYByw|R0J@==9xH~ zah3~ms-qDuu>=jtk*}(uu@5)ww>AW(>rpkR;;Y%x2%8czI}N|2TU&jG6?VlM*@KVF zeKbslgRS41?OJ~g=vpcdO_i=j5@w}IMY59<-47`kt`kKf*?ni+LVkou11w=11;vZH z-NwvRRM_^<`F$a*r!$R8>{Cv)oK8~gqh9aP%Fph}re{AOY=qU_Y;?ZZH=ZH~J<+Ax zeV&*%bcRne>^*44t3;p95Vu*Dt5YclUBY%Y4Qgjd!>5K_HebAWw4eupE9CU#>agZ5 zpWiH}mh!NGM|D_)w6U}&ZQ~j%Wxv!1;@7JrAhU@99%X}=GQbGqfhzbW44vm=$cdnk zw+R2KZvrg&(!$$Q^}U;e3!Y2wt5kX;2{uRHRR)sqqT=EsmO22dQ`1$e{F$|!b^Zql zbme(iWq}u9)emU0&D9@^^*M^sM_pLb71s6V|CLT*R3Y2n*fdf#%%Y^!5P&SIH_V#t zrarJ^qU(2gzM{i)r%ilSp)G0kur76c%;!f{i^I_VOJ?)^4fy&l|mjRNBf8dh0(UTm6B&}@ow%iDWcb9d17d%TY! zf*OW3htsz|Dr%k;D%EUHuur^a(ie?3l^uPhmigSC)v(AQ+oVG}L3Z=7@j^`pZfU81 zLnhm?=f%3Yatq7Dl^6gpOEb-qcQD$fF^MzAx>Pk&Y1lB3;}~BC5){hcIZySEFC{Ot zAROFAPq}aRrZ>1WDczn?eDQ)FYfP>w!PEkV`||XDZA6v5xG{9czDR)6FVNZkKOuE3 z2WObeRWSef)$MgS<+&PVM+A^!YDTF`guPK$b>O@s-4caIAx+82*ja3JN6Z8ys`*-D zIF>d(6rgvaGP%st!a`B(1dHMl&ezEF;00m*QDYC!f76TRK<-*k}6rsd;G*x2Q zY_McmJxLrLSFi4HTn6Y32i`3Li5 z>WyRw$v$L5P6JaNap*%}OL4sJ>P-w-H|D#Ai<=w~=ge~?8*A|phw%h^g})lr-+U*D zA#u>GHN?>0G&i#uLF$U0^O(yG01wb?`4^F{7Mwf62+%sPZ}#*Ht>4%CrQZCHbNBCv z#q0{e{Rk-NpIP#J~b ziBCVS|4)vZOo&PLr;Dhf8h^?9_~WIT;*S5;)(6jc@M8%7{pkPyZo3r{!+Vx^nZ237 zj>sgn-`r-uI9a15%t?Z6p!?wAI{l3sURQ?2Q14CSp zM`Z+N7aH`L!l+1hyrZZLH51SN?biyvqYnl87V3v@`ASBdpgxRX&%F8j_3~r8{N+)Y z73rCsRs2b26Ffcf)N_|dYBg^hDn-Z~3iv?&q(4NMfg z@+PP?BMz8Fp8d5^XLI?|o4%6DWUHQcSrFydfX&GajIY#b?%@f)_^V^vs>+s*93&TI zfmxEVz?+ZI3d1gM&}IUm?AKD?HbR4bBEX^U!jNg=k1_>qmceaE9e2rB$R>l zS624VYu%QZi1^UfpX#mdqd))nXF>c&YQDY_KBs>1`(IqF>@3NdoyxNYw{3s;%YSj& zpVy4Qncr^2xNQz*MV=OMW~sVqO2 z9EExDmJ$L*J-Ce+}Ka|sjn~E$zY#c*F_HfnL#JT zEAVQBp?NT>omWSQ&Rx1!kLr+~NdEwuBCi%tB`d-iWu(NCw0DIQZuMZE57C`&(* zAO4`3YIKKZ6Mk+r#!}NiwJ5D8atoh^Dnkj#TGP7O7+4n&u) zKE>GP4%WCnK^s)q)P4^=PB*(8#0!#{e(kl7_HO-L*O=&tTx4yKQF{-~#Qn`H(7OBbL4-YS_r4UQAIP!LX6iwk-?n931i%K6<2QK|??;H_Z z8b3c?bfN#jyw>@MQfHy#$o05Hn&e1NApNcI}V}$vvt!+ho{Tf(o)8|tj7RJ~X*Hbd7 ziXEdTKO1@FQuu^+=^GM~sEZP#$u1^6axHwCx;t621J&DMQS7RU#B5QWed5p zjbGoy<-}cOU5uP54f{S>fv&e9j}JMQuVH|C{o;;|#I%#qDH_eK3V_@q-+Alh#@r;! 
zEz{A@DQd|t3N+XawM5CLT%S%))GIh}3hQ{XZJ#^)HrbzIa`H-=T7B_^Vb%M(_+0)> z#QLl&RgBkeam5QhuAKaNbJN4^q`wG{q-~WF$KH03)-RWdV zP2s-iGfWp(J|`Zew$Ul2$^{JK4;$4v=BL-u5nj?C`vCyjv>gaXhoZn*NnqSyQQpS# z?i+ohxbpPDicO%(ymggZ-E`$M)67-YKQKXq&tHaK*n0&1?5UlDg~%RJbd zDXA!qr4=VVai%6!_~IE!&x)0cYxs?@ap*RjXvK@&n1yLEUub93pex^5y%L$m*4GNK zID>?^Xr;nVn_^VXEfK6VDWbyz3_AzTED0Ce74NKQ;#>VC()!dxn{3Ff%#B~Bvi41W zpu@%(xEW$tbWr| z^)*pn=S7QLS~><6qK3ic2G53#mQMBYiG?L5aZ<6RSBc^<<;$G<(I-ZO#3F~MuCiq5 zKZK~B=gv1QmgQc<-wk^feUL>6sQlg}I~h8ucRQthi_PwLOZj&;?*#Dx^D*yorp*?J zWe#;}l)Zu#ojGiCxJb#a7K$6ia_tTWXMZeRkvKw#PRX^A!&_752|=8dGoCKNSQAKQzWnnfIXoE4|8x?vR5Of8qp3Me&hMbU_h_&e z0eOmkM)EP(a5d0N7?vN;<3FjfwsrUJfv_|D$l2>jkLT4A_JLwm`s)XY%doVERMczT zs0yoAnJd%P3}($8R@yBJ+ap{|l@cb(;o}YODSQ$+xLoJhndlWfV1m!uU3o0qx4V!o zVzDB2nS4GC#mxA%s&a?17aUir=G~A>V)*40vD7egwR?sOH&2?@`Bisq*C^w$1ZRi) z6RA#CojU0oKAHsxvkql<+b+T3e42MuAohbw(cC>A0!Ot~rqLIw3?DO_*j!AWPI|7Q z&p2h4VdtTDJXwgB6()wqb6VoE<4uE6e+T;ENic4tIM_!*qTP*dgufw*OI6Lqi)Ub& z-!p`PNg@w?Gn607tiHG3kso^H7nY#ugJj65{l^JnP?GH3)~`hmQvuAuxiKT9ea=Xt zz@MJSd;cCQKw@2982l6*e_n!vLTp&Y>{}v>ip0ocb4op)Wyk5qYV4SZmE&OwZ=p&!*f6D5#svI$9ZSkdR$5L1dfRbe7Ax(E%-B$Rc!A+EbTdoQB|FmxNU0|n@?&zq z`TiGn_8hTh)LAqIa8g_v*b!!ZSfg(aVzJPlRJ0Dd4_km; zzD8*w;HfglZ6P%0yz9E*mZg;*zg%frhG5u=wia^wZky|{$P20R;iv3z^Mq$t7?N@= zi(I+`S0J5RZ0T^b%*J*kfOOBRQ&!F7R&5NKXcZ>ScO;GK3)6n~&%PmcEj={L8(ZUu z4kB!v8y2%{%27*pE9!Y=R^|R(KjQs z!P0W%DibH`Usn+RIY(KF#98A(Qc~2fWwg3ynTt zwG1l3@6g$pj(;@f+4;0wfjv*GLA~MQxuE}+Fk=oDf0kR62gQu0q*pbRO8E2vMKm-m zW4Ymwqt|_>!F|`!_gF8k#W?5W8ljt|Pd(t%f;_H5NnP{ zmMBIwyk(k<+1Ng|ypjC{s^YhJsj1T!UF(pRcFgKquKFXZZwHq?H}^hDKN+JzJ);&~ zoVM?*?)Jia#(!#3`9%^B@)KQ3oW@5T1W2m6vP1*zgewk;T}34Bb@7IpS&*AYYFw)u z`-s1w59-g?Ur4VwNR2eNdD6EfrP1MF?K7&@f){?(&WikQT13X za%t)1xwEX58}8O#Rp)7W<$x-#rpVBdf|I@R7rvQJq8zWy<-GvfTi)%>(Q27m%q8b? z=V0W)e=9%FH8u@?0)2VIW8bvG(P!WJ?LNXQudYnM&|9m8w!oq#8DKiwb_4I!w!U>t zdttaveYH2b8AxM>Ya0|o3S@;?s@IRc@Kt*zGyKkG8yLAheN!_RDLrO@-;UW0+Ww$Y zYRv}iAgCefg{C$dbTFAFefj>Rnnpu9OjcvB0`}4%ZNkOY8Q!&{JWOwtNULJ4vOhkc z?8%VVXJUl5WvVP;G*~Eu7X8fR?jtvqz#7O0ly}XeR~^m2WeX!qEe4tLCE2}s_StH)gV*XUg9vKWC$^JHNkPbLNNX8X zIa8baX`;1r%vVFji;j`gmBAlt3t|p#Dj=<|BH#rc$TTc!o;871fH%Y87a)wE^~K|j zx3R1Z@r~nMp%}2Se(QOAs_bjSAW}n3$~~4td4g;y+Rc2*{IWC0Rpgrg%#$q1Be5mh2}n@25tbplP2Kl=hrm;BhY3*HU-Xu}=q14Z;oyw)yP zaIk~a%-h(;mY46{fjn&efN-uIrD2*=fj^eMmjIef@>V4%?N@P21s{I1P;FW`W-GeUuH}?0&#$AYDW|i*egM)Z- z@DXczX5W)7gA^1A56kT)4XXCV9@&`Z%4gl$TA9=cnO#>%-PY|gZQge7x(1&x??1C}5D2P+0rr1*tiOn}X*_YxwnRD%r z(sG=y{KO8qD9NaOCUeo=jCBHMqm^ee?vGSl$sV7WR)_%NENt+3Sa$AUwfYu41tleW zCeM%<;Nw*ZIaaj0p%(g8>8@rr3>(z*8}w?%mbulO-|@&^ni_OP3>Q-0Gw%y;k^7SN zC@14%xbwa0u5vkb3yo%EUZD{3QF!S|(#gx#uqPP}8I)1)ff~fjTKSSGSA`=Vuk+h3 z&a`vqVQel-_eV?&5Im&(iuOq}6+aOV{~%i68LWuQZ;f#Qg^fV{%15Y5d4|n7dq;&9 z!EIb>EIu-PmrFzeDD0*dD^1$KN$5dEu-Wb=79*(!(VU;ezg^#LzW-K`jycrHmaFYU zKh!-c?J+Mm>&?$w<6&{0GY0K$Ond$y&U`stImkrug@+Uj&}(uE(u_{y}TVU=LMld~Ew z6!#?BNMS)63EnagPdl)V?Nkp+PuOw zJj4}MA*Eep=rYFEAtz}8b6-HrPH)~cnX0qzPTNWiS8HnIR(Nrs*jQ4&eiQ+>5>SAD zBGtD9oV*0LJzn_^p)ow*^IXc6bUO9z-qT^LWNBNK2nYa|EzV70yZ+ z){Wn^aFmI>k0MiE(K$j`d2Gu_{4Ck2T;H5ylF@U;WT%`wl-}RSv(@g~;!^9WST@A+>u%z> zHm^`@ZhTC#_ zyJi75q{s@Y-z^@uhg(M)2_i`C@In5dIzu!0=43gk7e|XjwgQ{h)-#ymxcJ3X(_+bb zh{$5Wr&lkN#7s!mm?5&~`TdDC!R4N>#XpROB$CC5ysoq9m!`Kip@vWd zdmnCZaS-}>UdHgfC;NHce*%|}LkLswF*%l!{i&pc(ggL1vvnx z?g7fM!lC|?gb0m+*Nkcc&US4y2huByJ$;25EWppN{3>W)kAjXM&5*!-k7>P_N1Ak& z&UM0Zpeeo(z-m020gk^gd-YN6rPutug-&Tp0fjA;ts+YuiYWkpy_mN>cI~axQoj~{ zwxb^d&eCUM(O0iLJ@_-;$mmKY+KsHPDH8kCI0eM_IBN$0w1>wY?K1}oK)-u;j5bdMSgWW${ITv}5Yf=e47 zOUvkP_JBaN;{hnt_hC@nSVO>`llSP`dpJIe2bSGDy@_0g;y6q)B5}PF>Nu4|r|W$1 
z&KTve?UJ>p1Es!al8u-NP+IxES-zmv>>qqIU(q7IGd{$S$Bk$`4M459q10ICkz@}g z)_M5uc>;#`cLwB#uK3qcZI)#InH@ImhG?O&P>9vkM15)yak#2EWXzLcLD6Y`82}37 z#lLRy6WlzTePfb3TZ8DgUuG`4QnQ*!B;5p%wAAk5yp1gcb*!V9$#Ln9#=UrO&ug9I zJ|hEXp;k|^O_*787jUB?6-OkgLbUE|_GK4QJu|PPINQo?PV8%Wmy6%w^vq}-9r?SNnn*}Y)87=1`#o-Yu66`jwFGK;rw{sl9Wp(Oo zPNLRyZ-~AVij3ZxuME8-ijkw)STI3p>8$g13Cw+J%(ZfIgT0CJAgG?bx42BrFWckn z`sxOvcwcNNH;r;}XRiN}hxOPwv8e?JY>ux%yjQ`C%wtd=sh{A1*{$A;9Lkmc52fSh zSo1e{&Tg34wy*jDKf4_idyU7r;fAVioy+1@@_l~ffS85p7Pfk&Fw^Fl=DoOi5juB% z+WZ->x5ojs&GQ&`eGbciF~@`$-13|!G5X2GS2cwk7R%0GCv+m{8y^A*8i<0Vk)m~# z=<3u_sR~fbUhLc_K2T$Yt})tvEh1>JqtlwHf(}da(PbxJz(hGp-{;hBt)0y5m9zw~HS>EFUaPGX$Jkbd9BED#9=*p9c9TLZ ze_=*|ieelPZ4RbjE!mveP4xCKK~v zXb_S?MyzD8Z+xKf5Nf95o&)0kKxH-Ww%Z z&d)D)>c39Cx~Y*+{r$d{iNxGmsc0uY-&7l+xuBh!+oxuN#aN7eS2_9f68wW6(?cIq zPs;W7?$`AWWY9G50>u=dNon#Z3h~%{-kDfz8|Eh}SMBI{N@FgIGE_->vhJ+rw-lky z`ueTU;Xie^(T(FEwNSqnTenNEF3z96oilCa%hmmIQ6?7Z< zQp6GS?Vz@7%vHXXeq6V7diA5M=c18hK8mfkriF3(5A(|HzMpxkQgyfS7SmT=7!lwB zaqCv;px$J}DB=#Xy$-z$g-iu2saJ5VRNFB`XFrQ9{yxyO8eF;%a^3)~XL z$-2BOgzw7mhAMlMbZ<@@3_tc({n=q-gpy^Rkf5yHGNhuMOD~>tM)!xoGWyo$aY)u0o-#Z-#(N2P4z;@qM6k|IZXV-7SL!O7 zx}RnA2OISl9}B4%$2vyp#&f0N@9?{GQd*ve`(UPZ4xN5mI)qX97jaC*udLo*Tv1km z(W=wRBJX`8p{hzWO)Ge8!3PAeo7fAwv%ySqW$)i-dEV9RQW+fTeAE(+@8}Wz-{d-_ z3@|PnW*45hYI`TazAcL~u^_TZ1AnHn{Yhqjo}DRjqL`|+>!PIpi|7G{kuE?Os5&6v zo&Jy%`t?WmN5J1i@elCVe|_E`es$Na<$y}is{m9N;FsDwR=Z?lzJsyPpFgASoH))z zAC=c09_($3t$NN$z*x(ONNn8!{>Cs~g7^eum=y^?c(O7ev;93+r%8*5iUGP-Rp5-w zZ+_;kz=$@N%j)0nuG)Mes`QHVT~vAZ^_-gG0)Vfx#|}RGv!DE5mzE#x9iYt=c?!h% z7nfNRA0mBEef_9+5-bD8Ssh zYmPOAGr4bex^a?RENceD;uKkJG{i~l{uH1Rx$o3ZcmD0yawI!qPLWklhVBr%%Q6ZI z3hE}lIu9H8(sb(&bM;?74m~A$4g8N%&dmfG*ZZ4$TpKHCswY0I_pLj6JoG4EL_HZu zfjX!GOeUeMHUtpga{#lMmi7?nJUW$n0v+n07H0)eq;voZm%PGjaex6PcTCqw|W^bY9{fb#JvGa{b-=^5?Uo)7cY<#pPZ2j$7f!>FQ zZb#h{k)xC873Q|rj(5Qr`qCQ-2F^Gbj!Vh&+YZ#0@pOUwEA)-8Ge7ma`X>W3)$~0N z!!_xbJ3M|iIgzIdq}nII_pD%27k}e3SxvXbs8m&p<~e8mBd($Ep zBYXlRN~MoUvFLUhXo7jEgD^hG3AdI#ylQA3a^;xGIw<&yM8#ih{My)B4yskxo#&bH zL2sfe_;O@FE1J%;QH^o+W|PPnGccHk;E?WaSHplxi{44C{DIp=YgD<`Fr2d4pC{!P zPX@F$vTG&XKiI)C@IX6xV#Sxm&Rzf|jWhTn^FGLTGzyE*!2p#3i3FgBQYyd@ zsBdfQ{j~CJz!ASZ_qN!6!M0MIiz#xlRyzr<)nN7DAIwivykw2n?$l?Hb6N#duG|h6 z@vVQeApVU|O!?#M|7I%<{Hy)ny>`R>P_KKo6$YPxc28>!)i!rSRCybl~g1l}C(1P?DDmL7`%M|YyA2~mV zy}pWHLtZ;h2+%F|6@wqTn@d9NT}uw0qpSxdiHYTDs87PeI?}yaA{#FJw=Q}Ar$=RV z#}Kdb@{Eo-Of6YE*UtvusWcq4-1xullXjCNdbUGd?w-?>06u8FPl|JbT2&fJZ1z~v zjapdVra&&&I`#6G6V5FLrvWV0yuHVXN9Bv5qn2 z=JW?kMSy(;HnUEvG#|YwC#&eh&(X&3Y&b98kr=4#7(ajhtXO_yX<6Gf-)M&ic4_zu z*4rDVE%}N`Ng@m!K<`Y{44uTKPf3n_H37-UVrMN*gR0b|dZ#Co{5Du-phB}#xqU_t z8t0z30G^eVeV^##nIfzllV!jsi+W8LT1c1-s%^bnm!UH)$%vkG{^ItVlD$5we8kf^D?ilVce)+6c*ZOY>G(TF7FeY7s^mRWsimgQ!o8{-X z0H88@TO!d?tPj6yD0jRMTi|R!(}pA-+ZOhmj@)Bt%wyZ@ktkXJ;muEjZG0lg%7^sJ z4yf}CypOl&MV~2$U-qft@tCtZCS+PF`YZ1PtkvbCcVL!H~dkjbe7 zmM2X)xMjk^bjndH-=L@*1S|P^-7j?>@%%|;#lV!EMnM56pHpuA#;MMKV3drwjJy&H zShwQvKtd1 z7ir?9y?^qrb3FTl(&$poMCBCoXbu#2OlWAx%j$NU+nto1fYB^kIfm;C>w=sTX;?n% zUZUA#9kFtwgqC~dq#}k2l3r-oh1%xby_Ryc;fS^9rAvqdHIe?khU!|%*6(?UuU^e9 z(C|Jxas_4d4;9tKYiW)kvx0&2J2YOlyvdm}#2;<0cwO=40I&DjB#bObKPh&vB>w96 zsGek&XL+(ZbY##J!W>L6M;JsJbdCI9N{T-}hhDN=!Eev4zG@88DlnEy0yVXNS>&9nz4QM)4u zL56U`Q*$jmW+q85l?at1dUbg|(Vy%M>0ylmJ~M%J3KKfu{jIH}D8tHZ5jmf(G;B^# zxg`((P}{>vmmq3nfEK(V2a^O3LM8=kEh#8~T>CtFHw&XF{dat1>By@H8+K+%Bs(D+ zzAm@FB2bu9LzOv4Uj0~i+o`wC*vbwwaAX?-AkP_zMf0$8y_D9}8gkLP5*Xj>Aw55j zUQm}Pubxln`@RPl2Z;w~-YH(?5ILh1McBz5a!4KU|M2U7570v5=r%IMRR%Mt@P#Fb zj27E?(>8MSDlbu!@sAf5JqS@B*hHHEfdp)g=a+a#EFz_OYb{J)2YBLXm~-1X1#g#V 
zD#BL+2e3>p4?l)x;CIz2*UYWS%y-!^_J(WNVBH^zht;M~kEz#=sGQ=u;{JM5)5-1a zyyyX{&x~|)KjqgcJ%^Df$mbv9n2JI{#lm3G!(9Ro@8b;T`S86mttGfdm!vjf{5^zg zZlAldV2CC-K|UEHjx@biZHkBT3onvLBykNd#duJ{Z`8E&qwhC?h~rZ z2?zq6hEeLQM#f^5M#OmjOmz64uhT!>9+eGQipJiwo!cY4Q~P-R?ZsV#=5ep)w~{3c z%=eq4QG>gyw}EcDF(3@~!w1dMVkF4PA0fZ9-Na`*1?dz`2yw{xS=&-3m^F0Jqsbd+ zM64J^0Bfqf=f{Ia{7%J2Mnot1x3&Kg%pza!0DcllF2glwnYK7!EK8~%SjioSzJeL6 zJEh8R{w5qVTD8LjGV5O9tnexmKzV4pRy03wfA1L3x7W!4lo2Y4zoQlp3`P7vOj(7` zv!O)y{z5C;gKy~;p9<#9nEiRD%kBqr4KO$Ls!vx%OI*q6F0T<}1H#=j2Yrzxz zB2G3_B-!#ox1+hK@wn@Uc;1^Y+5+ti(8ZL_Yf zsrimX)^yLGJ9{u$Up`rBItPcuyi2vw&pmIKamB4`@GIaxI`UKP{`}xqW87dIP!JuJ zdHXQu7Dc_Tl6?6Qt*#x#lSE`fMs(ZJr}$XdjWinUjYj0 zTl$!9^i*pEFkWF=uR6z~p6Nf%WhXeHL);l(aNM5wY(72NqKmb4Lj2qsY6IbUc;pZh z7k%G)EA}v0^MH|IOtqec2js)FB>~BbiP2hj=%IhF;P%f#@ddO@l-<;Uaeg8~vED*T z6X7i+ujTsOEv7%!weP%%OIQ9xgALLM0&RWpI6>!AOZi0m)2@wH>V~m+B8+<(Vr;q4 z5DEP6i}rtmz1f6ci>{su-U`^qhEaY@OQ?rmTobgiI^*SKtTqRVtcJx)-RTc{<#O9-F@oW3u<(=dtZ!?yIL zWX8IZb~bnKZ4J|QK26<)MD{E};xR6nZ+-?E-FhB#qdvp8boGFWuqLLHy=AB_tCWpIC=m>zk?ydUsG zs)GG`t~cBxE!|okJhNk7X|B`c+8Kd zC=pc9qrZ}8S%OebJbz#gD*&PL0}A1ZON-0Pwx3YB8geqkba7RkEx9eq(3B!IsT{jl z=cDwHtgtt{KDTb4q)^Yi@+ty)fnE#r)c12#c}1z0o2`?7cyB8EPsHO3b<=Y7i9IY? z4^2kp_w3-bUH5QtzfBWrq^tm-@L6h~e zP~2M$<3vQ?*w`b#xyE`e@)q-qtJt|De+MZ}{A_cJNFZ&8KqY)*)B;~3sIucFTnV3^ zKoDOU2AyZ}zVxl<7w>Lv#&n$V?@0z&#bUt?`coIAg(^-OKytLsDM;m;=eaISOVIx= zQ4c0lll_*Oy_Oy;aQ`PZ{*Ns}`TGU=N#}Q(GW*w8E$bs?x@Vu2FVIx|F%K>2KkqTs z5xN&TjQe&RtnOOjcYuF?!}Mn5!~>qX$1T@9`D&JhlUVqx*BlZ*dp+g$Kaz-#)HW_C zbimXnE}sik6O&SK$FoqaclN(%|~< z$X|QUh1&Dc37L@V8X4KTjpZsl$VsjB0Cmlm?=Cux>F1!tTfW$;@>-=5~(KO z(m>mG582|VfT-Qwsqu&)te$$>X=GC}{ zAb-%AUU_cXViQ$1ec9oY+^dO0Rqm5RDZaj4V({b-griNxc#eOj0Ey+PB}rm8qn@b zN$FZAl%l~^qD^M6&6yrUXm-cz$=vaUR{mXzTrfo=yK*~huG>t;PPU+G1+Kg<52ebOLstK+el^HD&BBx?x))sovsWt0%=dXIqQEKtxtqB2R| z#MU$VHTLhu7+|KyR}2E@X-DzhI}z6R0_Irm3d&(M;|c4Q_mf)>sv0L`kHQ62Pa4dr z;L)0mjVA&B{kRT^zdW2mrr8~ratclindxZi)Uh@GQ8^&Q<8qO{p0zB&%F94_2f(y! z1Zo-%iK)`!L_mTh4Bjsu?Y)}9MEv**SGiwK8>(fi(;XXfC4IeP+GwA9w#X3zshmCx z8xokx{w?*Q1l?wT*79egCU2tP7m2igR{#8W2EvVvMnO*IcGuAabCJ{w%#8N|?baS1 z?=_Fy%wj1ur~dbD^55TzW-L2L%cvC8Rs%UBb0V85bG9TSMMdBpMi>WvEPCUC(4&bN zQO3X2^z*;+>%v#+=KRa-v8T_R$+JwGh*uQU(HRokN5XuhdgR@w${&7vijJ!pTD!Gp zn=^%p{)>GJppjoCewOLaAZTG@x$co zY=6efo5n3LQHQ^AjQ`Jx{O4Vzt3~VOQa<)L-FC9L3ot11l#OZ(RFJJ?Moqmr*FEx z02ZD+)I3`ftL)<9(wKgUgWCHxR{Ot_wEvhiv`0lIAx~(p6O5~T6`@7^W3*@0Z$2y! zFaC7C5_m9w0x?z}qp^-ahD~XM6wD3UH+|1S`ySF<@V`Cl`yZdRxUea`p#D~t_EEj3 z?G8$NjU%f5%9Zf^%@}^=b{?O<=*yseil!3K4vG49`L7Ke%>sh_|G5bM9@`hbp^-=j#r-C`AER;^f~G3j*Dv&A4O{uevj?OduTK{S%H{u$ z`)CN#Y_^Y=R{x>^tnGY8yEX1CTjsI*QQZnn#r<)i5s1qSbYPw6Okl(%TyHzqd*tI| zG^_WXh(Xf4GEV9uXf^I%tj12Mc~~^8`+YaQLegbui_>%L=UD;BRTWu9VL7JneH(cL9)?5YK9GyufuNMS z>|a}D*>h*RH;qF<+S;G;3i7e$e*XUIZ`!h;(;06Q#vb(y3`jJM73ryjo<1`*nF0UT zp8Q8}{wI$Lt7yZ+YjM0-D%t(H%euOrftfKjC9B2JT6F#Emv5aOJ>3BrsYAzJIlrMV zLP~7^Q_|0$tS#fyg_i9&QbxetpUOF2F4!l>)7@j}Y}bTZrxAlEVSSS5nyf6?MegA~ zi!eHx^T62d#Q?aHF?%>-n`NRg0Jf$jeQzE6D7pQYlWoH1%s=^rf45nNsQ)Uxewx}! 
z4T2v(aL&|BIjH?62{LiQhfw@`ib_SjC#fq{c9YGJLK@Br79&Zx>Kdm?RuIRGbZ}%E zCZRRmum$U#1D0mMYxLbu<4(kMT2l{8oA-O?e+QIaF?$KgsFF71Zhjn-7<-(GYD`!P zBH)k3#iIMdKbivxpQiQsW%Vgs5-~-XD;25wf8hjKyyu3YJ5J2aSjE>jMlGZy;{?Y* zCyLW{{aol5ub)qfoo2(wyjQ<+5O}1sJ-M5;nEQf-fevKU`-{NpO<)VIb$vf@KqxM%wM`?F!5{X032E{aZGzBylOy2JkS3Ac$*aZ02iwEF||-}gdk;MvqcU=lgg17KE(k8sIj6_fdJD85tKXrQW@`z&5VZ~ zwED)(XZ;drZ>io>^5qMT-1j)>#elfuyls3^&}HdGnjer{q{hKKEV!4ErSLQNGfPO- zqiE3G$VfFid@`Wv)SadPyqU`^o+SAqwLLDuSy6r_4`C3Rx6 zz))lshE&n|-r1$XBqVn`s>`KpOao0C48J-q_FVf$-|bnL{-TVDAx%*R`(S)eLPCoV zy4g6{G;2^rIr2)3&E<)0J27<#Lio6v|H!R02QqvEVQK+NgMq6efIcMeHW0av=r-Ow}@6&*8kC9<2ZQ_d;{_ww! zBZOPq;U!&W@Ub~amCCg}LLvdPQ*94RTIg=&SzBSpLlmo^41x2lQCIb$Upd=SDj_43VB zZP2cIhazN}8b(tT?6h6w;<)vZ6D8h-+{+jCx9HN>|JH}p`1&L)Y8r&jY_Si(i6a_Z z*s*0_ig9)P9gE&R0@1XZaOvq5tgl&Ld1IT1Z(JtffZKOE^Ivox%`FoEMNKq#V&Sz% z_15?XM@p9}Jx@U)PcltRtzoeK?g=@e;7Jn^Q={WeoCc?&VJy6RglDv}Ia^h^M;I51 zPra|7pE>{na#Bq=kwDpElihUT8xx>1vmcUdVAv?`dvIFQbFXtvYy)Z1daqB`)^1#Y zaR|uZueqE1baO*44X$9UPMz$7inbNLKYc#9r_adpP>3JQ5PlxvWaa$bpTDgX_F z7`1xs)$5BtdRn*TH8tHDc+mZDN9s$LiN$>a#e4mSG!oP_KG;z{UiB(XLM#fC^`G*y z|8eAeXZ#kv+{G7Ivl=nSS;I=(LEJ9igFZ^=)ZP%`*6pxLj=p}!A!|~_-^)?-Bw+na z0Q*kxararN#f63L4?IE1k;GtooV~Dxe}^)M0$#naU9xnDaZm;jZs}2>>3ayBEGw|9 zQ~y1Vc0Z>4js=&NOkmg6T>K7F&;!3VhaK%D@c)=GatYw_MU}ZLjGO{js9$33&W~bp zlbiJDwA3kT7gI{vT(K0}=t19A{P_lu9IfP5nvQHs229hoo8Na^L-JI4=*{|G2V}4} zW`>3c$nPI0xN5ashlL(neU?_G8F?>^8J8}7TrF;6-s}QsQqco!}|EbHvS&ot=s3+CcHFn1OJ>q+)3(nb!0edkAopKCZytg0d z-ww51z?~u7736Ok_U>Zf(E&ReG=0poyW2B!r|E_itC@Ep=t~BdfR~ciZEWu+l{DV@ z-*!AZ!mkXVKG%%-$tcl#PD~3q!T6k}9J`_oyBj!BP8FGj_6ZX{9qy)t*jtdh^7qWV zZ%i4i+$fhcq>P(e@>_bK+J5`hd^EpA(Oy9@OIP%QqPT&{u03zHt07xVR^0eE-AX9D(4>tetwaxG#2y1ZyF$7^jQY8Me>7HzWY5 zbFAARX*JIDQ^s7m{Tfk;6e9;S&YgokzMIZthU^+|7(S4pGgn!T_rH09d-;iq)~`r1 z)01+5)<+#d!l{yR?mi!@_pQgie!L^XN&y&5Rsx$Q$4ZocXIPUBU}u&#{5NHK3_2`T z&XHozL2M0@z6n?RPCBtp>}UE01ZdDMEDR9wtri`cGU~1)vbwnEd2(Fb|G>9(0-bZ~ z%<=#yo(w-dIl@a7E=|8r3{(bYW&l(;?jzoU7`LVI4ldc%$>~VuSF-NI6Aw|f+_0quh_wubxXoKkWrGT!q_QrFN+&t^7jksK~?E&j%3#$N!5~%!l zVIf_>Bf@CmyVbzG-2okWsh%4u^sTF!Z?5=F?)G|QmZDvg%8*Jy@sU>vtIK_OXXkbqCs} zulKxMCJ9G5|M)jvv~IVu!&U}e<&2=0r?9IEid^UScY3=}^q+g|MY-NcHYcw$?gcQz zVr3<;#Yfkx9f`zk^pD7bj}@F_)GMsd*!Zz3zUut(`6$VTG+HXJZ7 znC~3#JVvsZ4Dhv_k+|x~H6L%3Mmc)KBP@*1uvH4S^t;mK3rSuu}9Zv{I5EfE^NUZL#b-P z^6i5XwG;`l+WEb?{8nF^#IXD1g+1$y*lz~Q-xO?4UGZ3MXV16t1(X)Af@gk!7y8=x zCkeZua-aigCCfS71*K(c zEZZa==AQ5t!5d8?9)2&=1Q+H+>ZL8i?y?H)kMl4iE)EJ+=bpH!thbwO4^--`-x=QT zYkM{zM(8ZC7A42AtYJ$=Lyg}QUZldK4pbM0p7msr#wH-^He?MLI z`D349WsXSsSZ(#9;W6cSVQ1euroH`_fMn~yov3DsL&n3kQXb25$F#Z^^cxLUBG)mv z)}c*}h;9XDCH|(l{k~CFXI1+7L<6tOsZ1nfh&501!x?G;5-32PV;O4W4HmHw!BOPZ zo>}>QX(^2^fPzCQuACB@6^+O80TdwnMX|{Pqf+I%!ziPoUlf*=RefN^Oq*l-<+Sak zmr-YipNgku_m$%tU@Z!Ut%s}3?uy_b=B!>a^@Mu29*5c>x)ti0hl z#nQ3j$r&>PA{xV%_LP)VgZ&5)^kelCZZ@37%zl~&&Kw!@ir83Lm3L=*M?dexD6A32 zLz%A2x9yxPq_q+2N~UPpgFQAKM=wo{>&oI!GK;LxA;B%?Q+$(ErgN*CKO1M*xW}F! 
z)pOW}?(8B&gz*>S#LcIH=I#gU%Pb|$Gch_#*0#)-UD&2|{J+ht4EMsXEwJBbhd@mW zs1iB4_U&vB9oS!6s~{e=n!$HhrVD# zibX-i9($B49E>0L%JXtDkiHVCIK=@7&JR0N=Kq69Z%=W%BWvih6Ij$dUHM&b z1FC=3+m7fyD(GDV$gnEr`t^>EHK|cv(ze0B!mZl{ozlSFIIF_)Xr&>H$%BA9qJb^y z$tfqA-c9AU)}Q)hpX(%_Pu!|_8Oc!0FUovkrC{A1Ej|p|-Kn|K($$Mlh+24kbcO87 zl3@J!jKW5%u!!zsZ>#OCuk4!UK7%7Ojb_q1beghkq7o$RIeEeTAjxmsfvXME?u}~5 z#~yz5ayLr*J$!W&Q0Fr;*OU3^sfrfbxg`g#Q+^kBwrlBL;k%@0GLYzDgpT22SLOTUTCOj(w0n-T zJiq4+tYClqYR3Q++(=a~e5uqzMqGk>H72L~z!fHj@RG76NxK^(BW3tm3FF1VG#H$o zyt+04GGGplE8H6mw|sxG5bZ=`j>1V6wYAfS`$K5wm>t;~D{N z3Q6fX)Cy8bafVc{8m&ru+6EUml($RCtI518_u-$1-|wDA)K$@wxr`>43aEZ{3`Yo6 z64&HQ-_y`L25-_l-=r_HW!0)Lfdi45eib?z6mH8);*c_?4f1R>BXAIrybkCg8DSv> zik7>ePR#Ux$5N!N7_=vYuU%9DN&1=0Bze&YFoM92ujzgjRj4cHMjJ=DvI-J1xa*ln z8b&Gmt=3!FLV6r0)jI``D%gJ>106D;I>|zbjcz|in--95&wAmM;1pRd`H9@sJ$us= zJjUxc)A6oj2+H^HX&s8=B4lLpl$<1DOo;us&IYZ+ci_6ydMujY<0#hrvgQ?f@1Yct zJbl(|L#Em6PVZ%dzk30AC3XIYRTxlI)A#A?3T8?MpK#wfLJ)Gqr7_i#`2Crdd}vsd zR?FaUQJAPws!x6yb}y_Lx$I{|2rl`?Nysj5(6rQQs{5L?E2Z_;<2JkR-4R~OQhqB6e67Z-G7)GY!0d5kN=t%|bn8nhO`w(ap8X);r3r9FE`cIqXu+`2D{ zHV-j`PtoMQghQPdKeh~z1x*YM$WzY|4W6IuBbu#c1ZrjrqpT$7D+^f1(M1}$H8;{4 zEn0PfqDWq2;EnW0%@Y(QFSEssh#g?jNdBna1C9tVrjfj~@gb}wyMA1)6 zmDK2d9nG<$q;S&9Hb}eyd^1D?h|)N8XREsNKANZaoqLvM77%(78GI*dJ}5St>Td+U zTNVdceYj@V{2q(WK%3Bnnuc&gr}iwSW<23l?-;y1t%q|+(d6Di&y)A>%Ybb0hS_=@ zIpiC%Mc@vnVjdPCp2YK9sCJ-aqW&cN19Tf$bF|PfGFqYva0aA+YpSak7hmwCQHaeW zE$k=XxeOd0egI)Aby>LFe6&xj3KToOH+rwW4tQ+srbdwcMpV#si|^Z8S-}(pQTWW= z3T*FWpg^Nh(Vs7VFrrAZDi`)OI_yh44l!zhl)@TCWG5mrUn(ekcf}^b6}iYe zG;GT?10!R^YGzeAMzUM#&$MI&d6qR`ZfSfmv`*R?1F#Xqe3f|$rL4;oIi*S**y^dy z0r6_E-Sx@uW2+xNd{U*KSyi4;bTXTb@6lWxq;U72&J}0k-7V1GKfu=2<;j3y6N6y!Qo(9&U#ReXTdmw?>n0{)007%kSkW1j()RPx-ebS;$*Xh3nTW7HIJAtE^}6CMwY^*e zS!$?+kwD;n;muRT zJO$!$9cQS57F0c?ve!g!G}#E7|9D2RNooUsF2=dPbf58d{TnPJO*xB@;CQWlWO8J~ zVwaR3Iba>Ur~jPifsvKv?-97bS}`PGl(ktav}wXVpSjMIED!&kjfElx?qYdCRHa zdwD52lY1YPMN9~Da>6M|1;a7{i~O^b?rypLCe+O9LjVsU=L{!yOrcXK88Ve6;{QU6xh#`V?A|O%a789l4MnF&Q%I|y zuWn5Yk&c&?J(|m+tSc;{d(8#z0qFJd1U4=++;=O3#N;kwH*BcU7GpIbxE8haKSkg+~^?L+5%FYyw&g>Vtoyv{Pee?EQasa_ZY*jmiDn zgFLiD#3RX(#iUCHTL|qEK{hxIkD)b29DHpQxG#=ovRipl? 
zcdqcv-Z8XuV60Ycbgo>hTF$-Eq~p`AQzIUm6GoBedrs!|lc1jKpuH!rdcBL6%>i`u z=P(4wgY3a{_Ukl;uWOgRC7P)=O`iV*Y7{Wv{MIreNzQQf7dh}=eqgW!s6qsT08V#k zgS=vvO1P}$Q00?*8A$)vZ`LWuewO<=T|ic7IV6K=;^J;8sHF1@r)e5$&!csmKswGY zP*qeL-L8JfS2U}WaY@hNxyyBwvG)y&SErTB-wf^r1Xb_Ono5=_e3iny;;M@2N*%{0WIT2huAl-$M}6E_X(%m=J~5|G#-3Q* z0%)<8h3%GAyO1qbr;6H4Ew3e?q$}$Lln(K>^P7a8*;b-w-m4KN3b}nEc#Bpe5@c;I z0Z|on`P}u;uI9J-*q}ONM;1z;o(_kF_%NyY;Hx&eblDZY{M4vZhO<=?z3}>njrd!S zywrjQhnkLRUji)difUDX z8%j=$n3^L+_#Lg?gs+NUW{eV#AHGV2PP*S(A9iP*&p8cZ)Z8OX6bOhUFFw@F>f#}^ zdC6v!Y+hS>lzsB@JMF+f>DZi7w97wtocIXQsrISo1CM>*GYQZt6YKe;)Kq)#5kLyU z{KkxqTO%QUsD<|h-WOrv-ElJ+G*xcV^iW&zvux+t(dhu}c^@B9Px6DrY3<}r*R#i~ zq$@waR0{^R**Q3{%O#e(n)CDN(@4p(4}QnIhf5K-MKq56-Jow_VcsRC-r$|5^{7SN zIU7&%H_!Nfkb9fDXJAWPsk8rG;`L|0MX>t3yIsGo^tosQj0LVtLcd|L6s3sjbca`Q z>mSoo{m=ubSv)$*y@ZM3Vhl~1n#|jWe#h-;Ff)c^+^J?Et>_1`sT2+}lr;pjp#b!9 z+C1{XTMT|)U!QXLk#o2A5{#efbZY&1ZfwZt1(U~iDP1630$I#cH7a zND9GAInB1GM@B?OfDk!&js1GUN&fUYJ-d4R^Szm(-eVufeY7FGnXB5jkDhcN%tgMZ zh5aS#fur93jjrkb?a_w&Z}B$-wHNH(Xa1r|e7Tl$y%^JnD?lw>V`BM|3jN{ zUWC0lVES%#edB<9ERE2Hi>}DQ9XS4n(GEdAR^~Sw;aK&ONw3DOCO~wj;&B!P1LG-m z+B!dR&$wUtax$@D(h61NEy%K2?}$_KxVs+n8{JqOLL#kOqq8k}FCeKqS>R6gVC%Ap zI@Vj6Aei^~sb`lZuUOG1KzkPQSzV6!o++;(otP|}Gs3z{7A%V0DN2GMx@pBG*BW|S zEd(1XOhwik*5%l$UXowE2)bSL7z8sK(g6(-$}CAGE^lN@=R`{tlVYLVVv}r;m_=So z*Z3?WN>-2k>$*j_2VJ$e$2SeS<3p53t9|0b_w#Cyan~!vUzYFM%m`G%cJ2)18IfOZ zUEICOs^hPogeP4iSzDj=v9_h4=Y>}fHq@jO$80(3PwEbM{OrDR+Jpa=siEp%fJ~Vk zFT>v6x^Rd)5S+i2?p+f~@me&GFYK|kb%kIyHhLdQOm7}6<0P9D6urHwI2fnboAOAl zityKq>FMclsj{H`eI)qH3fM%}kiBuV#nb$d6pvnvhXp-)9Ly6arm{Ok;}l|Pq`ZLt zkwxjs{Z|mj{0iIG??BftIH%2*7ZR&J~0+NRAV)$9yTZcTiz6Kw#E;NUA8tpay! zP1Q{Y#ihE=ZLF`aj^7$GKpx@Ww6&pXTiMgGI{F66YwuBx-=8!;~b#{Pl+RCKc=uG%xB#!&jveN+w;2B}Ie-I3B*?D&rk<&2#cftO*l^i?3H_*8C4a1LD=h zO+;u@azp0l=9XbUZh+}!4FM~@uozcE6)lZ5|Fuk~qgR&lJD8=^;s`Cns~Sc$I^~R?hqTnL8zq%21kVMXq@i!{Kpd)^TL zKwqR8<`6v7b3Z;SlWC$80NnP{UNW8Rgphqv>ks=uZ#-r?rqL#gfd?MM3iY$c-)X!$ zGl|m&eUC$|66Wi-NUzeewNWT$38V@Rjnbq?&C)?1rWCzHR~((ld#fT)e=h;#Ew68f zp0MD1zw0d$*_o4O6l9Z%v#K`|Qv9aY@Qe20h~10k2t#I8FJnwjp>-<}?mUoCk)?Ao zX_4HM6(s094vuBdY{q5s4*9DrR-{$LdV^kRU|X>3@;jk);$nTI@R6xDs$|z%suZxQ z>CC-8zwymVifCMlBYCK>wrg#&CCm}ANlfIM)uF}g9!5mhrSWO&MZY$MQe$J5P4<1F zwok<4KNml1JD*Z`n<6qE!3_M~J;=C=G(EOpC3Ko+tPfT7;~wZyKarJID+=a5epJ+u zH)uEY)m7n`AwW}PPp%ize4XkBq}>f#tvn)Z_ZM5!1;^QOp1{c7XDcP?_hRn50ghqS zenc1Xa`%YvQu7kq9XON9QB)l|rO?vaP>Cthyr_usNgl~(nF*HkBfgO63%giP=M&b*q# zc#M)nw4Ji@?x+n=f7*O;-HZS1W$+bMTP=I3s43sO+^tbf%lvpG@50sMq~vl7QR{*J z;CW>Shp|QFS8+|2O(@0Yg7x(Hsq{e;o~Ohbk99s;c6Zak;gSQtm%}eWnIW?)aIYbM z24|o5OmG((IV)s=P!|&u1Fni;S&FTuT|(Y$?B-(;o^8;he4oX|dPRZl%Rim`Ut4~< z0nt8VxxnP|uM*-vWJPykyr)RiiY#N@Te0V^Sms+&c6j-~spC&B-4I|ZToZGaeW~N% zD`--->Ka|}4T~w33~9YG2NRnC7X4Ct$T_wqBG5k6J6tG!4qhxbu@0w;HlHDMjgpoj zrYhs>Gy~%eYSnNw(xUh+T7gOS?a*;$#$3r@rasZ9o!Czf-Am5m(dfc*UPeNHnn)g>T zoxaGZJERgAI$%%>JN8JHFBhY74fQ>mXz+Hf(US5mET<^5*2X06=Z7uq-aWMKyqK&M z5C>1mH6}GcEW#vFN{Hj;B|zhH&6jFjx?k`)ZL=P#_3tU$kusOt($6*@gKX3|A~$&D zJ@9!S1Y0FQlYBL!J7Jg35ROu3NW!v0f>~InP)hFvCAgI{+Ga7}!W8tQVXK@U>nKJ%38k^&25S4@mo&6@Bbc0lie6SCFGRWyG)C~TF8-(n`0{V zSn(;=7PUk0WU||^*YzvhM)*O<|D?V>RmD)ng&ftm>q)vlHz-Er!DGJ)6Tmh0 zHEOq>`R1#P*I0;XL&o1_j?$)k7v@*w-k( z(Miw<8t5{E*>lY*XnR=`*wE+sm)xh}kLuCif2)&NE*h#ak9S_*+mX`J2dn_oAcEJ1 zTdPj9mpKMQTfz)4zxtVdrmKoayN1pU|MStOwbe%@A?rU<$f`uPdKuJ7fLp978p%fR zcp!u8HQ1yn%7546A$4!gKTVY(@Wl>Rov+dZEn;To?M$MZ@+}pXPn<0ut}7vs2J}ba z&s#PNw6**tIGwCuJ$6Gz=g!6!y zxVsWPea}{WKi|aNjASZyfR(JdOGV*dYSs}{Gr;iZ`FK@P>=`+g)6Q6e>Am2PH!H_z zLBxJc!72SaU;$k1Z~fa!*(%YpX<3OQLnTa(cAPp(dRWf@n6gfl{q~@C4ayV*4YhE1 
zQV%yqnR9pc5_=OCC5#TaT8;xsXc@IsD*QZFfz0%)tP0n!dws;cqK}1IVu>EigfU~rBokk)# z0<281 zFl3T4%1Y3Lo?+jAoGkahc~}30u>7voe)e*`0=#qAse4#kr?kwAd#<#aa_P1B4^3F9 z)P|y6fT%d^OJ5onJ$qlI3pSu?ozv>}rQ5P;-v0hGGs4f(7KQ4deXUCU9>+f=NI^&S ztvXLAw%?FQV_7K7EAzz4hvo^Bv08 z&&>Xodr0J{uL#Z@D|SqK2J`Em^fq^9m}a+9B3d!U=xi|fQw!} zTh#wsE?MbLre6)6Wc0?plcrY)d5@nF0)1cBJ=c1<_eo*!p}f7-EY$X^5!=yx;~LNJ z$WFF?$cjnOlN|3o|8DW*AU>Zp6Hk^_YXwofaArFZoFmbM*7b60O7 z3j`>5Ls*(-lgs<&M=KoSLf}0M=v)l@A~Fh>>>UeVZja8A(QZW?-;pU?eWT9duM4$a z-E|Y+G75nNT~Qp}Q8h1nr+7dDlMGj`Ir*Yv#BpH3Q;_3tG>tJr2{+2^a!qMDTgKbS z32q|pr?$_KjLA^pHKC&;1?7G`K#?AT8S$~RON(mCy zB-msSd3pTYp35oS-;;N#0@&30dXP!v3pLG{6ee#ZuR_bK+o54$r-4(`QmMA9?QoqL zsP>c4F!g3OShE7^ti5Gk!7W9y=KdL&y1a78Ioqs2)0tpUvx(`thatn<{Vb?+HB075 zT}$t{LxBFz25i6SPM%>@0bkS?L!LIDHciT0K#iD~G%yDs1c2ywJ^`26*;(mR*%&w45j zDE8*21v|pDjl+D|d;5ER%|bbI;R}^SLtPzfP$CHH)3=~w{~1L~)HX*={kf}^^8K^B zv{AY~UD_s{_4;N8(u$_=ueL{P7Rnk69#Q60({Qmh2Ai3g=e*szwsHu&wQwU- z#GWmn5M95XB|4q@(ST>-nYz~A&u^7hMALzOkdb$R%ZyZ)P!n>zIG({|#PmFR<5*kv z5keDMNw~xA+*$5QK=yycwf{IYklso6d5YyNG1 z1Z#+_uGx|eExk~)j`<--YOSuthdB1`j~as7;vq0<5Wv4ETb|2TDvwg<;Yz+@k2j9` znZ@O~_}na~&TL`5HMa@4_{ndyQ7<^LSn6pepaa|}Q2m^Kfsm7YLWEr)Q3{2FzS}X4 zg}hcUd=N!-7xFioN@#ZR->8MG<=#$eGuck{0*gIY@Y#-K61cWpPFP}-9r|2Z5SL!5 zE)X!qPm`whjGKCdGi>hcF~yLFpwnk<@Wp|Uky9y3Flx6D!8#ezWa$(!CZWW|xodAgb(JI4ok@D-k}xMp#!IG;Q@Bg1YFx6WT!KcIVxiBl|% z)j+4_mW|CHb{D`bLRP1^M>XhHS%v%=UQrxM&zy0F>zfLaCr(o^)6*vZhrPE9i?ZL| zK$S2+K|w%}Qb1%V1p#RkY3XJtrDW)C1`JS1NokcBx|yK|q=)VpO1gW7hV$_5z0cYI zE$;v8`Eai5oa^{Wo~d8|)>`+y?uBhv41*`%`Q-8;`f<7{m={_pr86-g=J+-FBm=5o zHN3C-^SLR0h;rgG;EQM5<&ZBGq=ww>fx57|oG?W@(^%1JTJQxZL+e!g4{}JzR3?sg zUJGcE3m8iZQ-XnD)Z&Z7gAyH>WS#LoaO8q}E!r*2g#CE0ckqdSM72Jgs2av_S@f$i za|uN zlsac7HnBE0vS4nKdY($D|Bbhdb? zNv(74K&v})b-XNRl+Qh-65F9xwz$UJeq8pAps!z>A2Siife50y|46yxw4j0t*XciO zk$?U<$@b7|QP1AmpIObB@AzjVS>SzpY5A6eR@5f8aoDZ5Yw44&a*`6+12DEd5~%)n zzc(Qob>S8Mp~*aTwA-EWQ0Ok&w703-Y(+lRvK8#We+>Xf+KqJY** zPCs%whe)JYwzdF_w(ryCkJ9zuZtqTNo{dBzyx3y2TR^=(O6G8Hvl=Q~uC)@)m!8Y^ zXn}yX`O=Bs6otqIl)v2;vaA+B{{&#!){{y$AEx0YBEsNK?;HO3%evH=@#IMLIWq)J zG#{XU0Z_!MB?AyIqJlP&Luk%G55=CMkWAt)xos9^cPfaM;!2jxyKBGVw*p*`9<~Fv z?m;E!SBZaw*C!n*n%htj%x1a_yF_l{kjo7~!i%Hr7b4n2oA znP4XvTqmGWi+(oEIwY0@EuM6Xlz60iz>BPLzP^T->+_zWNzsE;Y*bp@2!ZH7Tlp4L=rH7=6_`(f| z^?9;@k)?eJu@io6DLB~VvJv-cLA zev%~)F6K$t0Cmx>wl?9Ku%8KqU^ajgMcG;+V%~r+i!URUff)ORg`qDBOC^?mjH!&x zMCudi8my$2mGZ*+QD%1YJQ|tq@?#QpWx^ZUVAK%{w9Y*-4`Mty#i?P08kUE zIYuUJs|+TG0IjSqV7-|qnppj*gl23z6Q)cR|7TF?>dUg7%A#43et&hxMnsHgXJp8I z`B1>e2KUctn0ejdg#=zDiQAcC=mA70_QCksK>!9-_A^$_I098~8YuzI3%i`;-mb%r z#m-r9h6Cwv#fa`&UxCTh3dQlz4;v3Xyp#zAniUo61l*>2xBBGPpO|F#)>?*yRH+NV zOXSNSm`(I(!x_wt!gbN*lnSN|H|O?-U!Gg!6<2#WJR~SJT*RVtNaj*uX7lES3gtM! z2FcFrtwQKCw8!yq6d!gQwQQllbZFQOIMrhf?WwhDj1JkmFQ<|!+Ibmk-0~*AXYx|b zNuV#?ppiKZvh*(i@z?V!SC`k`fvfpVi#yZFr1w!r4XiUw=HZNw>!a+4M)D?CH|{i1 z)oJ%8Ab&0mbtG&TV|!!BlhlpiZ+LSKS~+hs@Cz`@-*#Vv#{qOXJDEjhcw?3%hy znvJC$M>Tr3T1IW0CPXvLKbW0xH7$yy;}SbPG(?Jw$qEReZ;E{-!HdcshXj+{G(21v z=H-Qx1f%Yr`Q1No?Cjutw_T}0mg3iOuBhc+2$70hZtuP!_VqQ8k^TvukqHh{ac&7r zbUg3?Q9&&_7);(PY}8937i-oAwUNxZ>%`Wx4*MAph15q;n9 zu-f(~-YL!4wPNmoFDnGRr6HK|fc050O8;LT?Kp*f?6~ ztF(^fU*e2@5m6Rb&4LhLV$m~77EGtlVB6oV(P>ffRWo6Cvfx$jL@za`d7Y-8HU&UC z$Y<^_wtpA+;w(sHrG|i066w*P6X3a6&Q1Y<{c5gTk+Z0x2+JD#1d0^4CdQ2Bdb?&n z&%EX;aH0FrP>)@p`_k;pP94@FXC3zhQa+Uol2)Y{Q*<>Hn39#uk_pg)I);XHd0L=t>oiLsKW5)h4iN$4!)~A5zY-GOA6^ww! 
z81j?bXu*iU_(A4mJN!{Y(*^BSxos-B%STMG`Hb@% z8R9>(#tE%BEhJL$jv}3H1?-P%Y8m-s!|q{TegAaL6=VgS?aod2L543@WM718)M!I= z%2(85`XK|?KI{DC(IaZ0-Ef)5*msAXfVOxvvGvU9S1^{{swkb_`xT>Q?__Co5u882 zLLylK4aBJ14`CIp8ck!#IHQGyg{jsrQhIl)nW#H*+W$&q?7_!BsED7^a!G3v-P;1E zf?>Q{OYiKgF$OBrh4C=j?Qt(S^9}b;h6Z%#(M=|rW#nk$cO0c@QX=W+?OpfQ-=5Cy zJ>kWH6`e@}Z1>NP-b~lQseB18$-SKXbkY&fZ&+}wK8P2@QJ3>+O3ri=mgSB;1rlc; zPk+CTqpp4?1mbRTn*v8&(`Q~veRo7H@^hq1je$BpXQQ$(J_vZ{y%ZF~ zNrojvlKX#87ykEl#{;?B4%i3918EYZZifv`E13UZN#@_?^WT!J&qnS4aV42EqeNhE z^Ok7L5quid`cLcKmf$Lzc2$2M@4XuAgomZ&3d>`6zfTu%`eo=jI6C5qLd5$QGwnI9 z#K!^(D+#fKLCOqDH2AJUnZcTo4$mh0ZUH+KxJ&}6e3u^7Oe;H8X2z4^z@jdMh{Faj zuZjUR5>7Ecq$gq2Kgf;0ACLbzJn4lvct`5Pa&sgCYn+0Df?0c6K=p${ zQsD3J)xXxvD|`ZzJ=Q$wAD?J=tNvMC$A{IozL+Xa;9kvHQH()77G*$KB9f(MTFC_B zz=n6@fdX)HPB{+w{o|?fe|u`nR5KmQO11$N6Df_*COLG@(PIFI(ER;}j zt5FVZh1!xqx7Lpesq#^8kot*3=PPrr%rd+_;;>Y{W0tXrRo2!utp|F{*f8jt9$SJw zzqp1-shYc&$fTM8d!x>Kd_WWCJtvX)Z+I-fInHZH=uGwMx!*BBj#Hy)t|vI~sH8kCqLrs>YmRV;8eqH0!UuZH%giF~QC2R3i?+m&?vS908mTfs{)_^Fj383GnfL44T z8Q3<><*{~eABtVW>qy&zPVNm_Hnt5lJ~X04-#)B1Chq?Jpcu5@{p=G-*t=U255B1t zgCcuCYK+>2>w-pn+@>q0bZti++q&~@RopHbfs`Vhr#ucFZt(5+-Kw?q({iu*!@kEj zbgH5gB|4{N+scLJ%hvo(bJ=QF*jW2B)k1&x*-6g!q{=le+dX$);is1f*ppsedOBr* z;!M6<{3mi#Zu*mHgnJbudCf@H!Pe3iZl^|(!!mRU zx>Uinq!Z0gR+xS@M%(R z>2mr%(mh9L>9;FzUjneosH;^VXe~0=G}IxfpIntre|8KI%DeA%sXDEwH)pI=&)QrI z4FUXx{x_^GJrg3NK*=zRgWKCDdzxq-8x_TI>dng1isA9JoLHXFOmz_Iiws?BU(7_a z)Ow)3R?{0~l-x{9ul^hm8zDQM=|vGB?H@0xDSq3| zUtbXOzF`RMrLLCs5Aj>H@?5XXc`u%y6x&hs)qlr3J1)^}Iarv#Qhc1+IR_{;tD5z~ z8*mRZE67%0tbqnJK-Iu4lVg*nwd3#bV;(ts`$9IYVqF95BIYab4$x811>~u)eUkOz za+3N%Gv{xPv;S#G{KMd&M_!$gtw|ziR6*xRN;JJrs;?(393LWbX^53KuYC1t+IW8L z#=gfe)o`Iz2yJv)xbkOrg?{F#dKPU6P@2PI{o-GxIoRq1N;pb7isrv_t-z>U)`Zg! z`||Y0%qPBnGI2WY?wVFW7pc06QGA4GnSq3L&UR zkJ+ASne0vpktr#RtE6#|4nhi!rGq`ZxUyJ>(^`hDbri;aK{`F%ZkPoO zU%$EjERjSj*D>}bhvYytq&CvY%eDS$mfjfij%!M>rH;X;TiAl7_V77GUoVVx!18g{ zlRvrQ_GRfc;X6T(aW-iegtbN{M$~{ajMYXy(fls^;W1%CkUakT<0m!+FA^T1A>4}P zPHXhX?xNY;^D$0y@d@jt>a|i^n?H6*Q>mSGu;CdpLtCL@{9n*K_hJ59gF9 zc@KpB=XcmOp6k<$YTDe%PCS)2szAS$`jNe;J~;|z&=2*)Sm-#-wR9W(W4aO6%`5B+;XGWvb>Lt@U(+tc0LDE_h<@@ zj+w+3jOC8#46;GwrU(774pbk^umz&Qo*7BE9?!|-m6i^{mr95kg`676Z)g`mLVI7P z9?N|yBoIv`VO%I?i2QXo{LYr}5u+t*js&WC4D(ryrE>3P)MO2`Q_0r8!OzLjG;80N z7dbvK8!Ksgn~;)nGavDCn@AfDd3$7xhjZXc76$VWF;p<233)Gtn*1Ejc>lfzAog@m zUq7+XbrC{yiw`pTcz-E*^VOvZ?YHsqo_oX8G%K3y>EEx>bfIO4n=&szMU_PtBZcq! zJE`_II`uAxU7~B4=R(Bx^rA#W^>}t$VVCkKev0E>*>FRWhoI zmUxt*A%LP}#K09}rTx&X{1$c-zUH>HqrG^#1-2h)rXKpL;AD<2{22elNiymfp%DB| zvbi-6$vV*%PhZ_A1PD%xWhdcu!e+v`w^LCT^z0lcfGkIjsjYd@Mo z$z~81o!$u-^Hmxm41pbIc@avPgLr+Dj^$w0i@R=ts4FCHXg z#St<($lVx_WV6{iEzn#f-njHtw8SYN^)MPJB|FF!v)!5`fD5d}BnrjOH!AhAtocDtTe9zfl4BC)pQ~rO$3iUH>w2#+uZ3|zo%va%oQmhBa?gl4*Lm{ z+yV>T!x00hM>)|CaH;Eip}tv#WsRchq_Z*|yh{h7$LZ=5h{3m9nCZM({|WMzBsmS) z-NSYG2$!I8t#{B;|3fZ1qMI%*XmFO1|G<`5-qWq+H_(*zQ(DI>0&SKeW~-u|r{K$s znQg%k4}+q3eh-Bta#o%CY*`1jS|5~>)v)dlgjTFG=zqCd|J{%CtzFbU5 zSEuHxf=^5l*H#Wofg7bUbFxlwq@=6&9bHJ$y!a#G4_H0&-@qyp}@_SUu>28p;^^@r0vtj2hyz`i}0OU3-Lpl zAjnfwRd_Hu(5HbS$jV{0HK0?71LU&_iSjQ`!`qG7jvc6uc<9pEa2`Uu);#zR#}-@BGii&V3KED8nv_@X3(a*}qpx>4-DK{N?gtFbVl%Zz z?mzqst>#6t0#I{LFsqvN%k~?I{WVwrz7Iri`<8G9Qe&`R!$pM7kA;H-lf3*yipGV? 
zrBMSprU`C+4&^EGY75gwgvJ3O2orDa!?>m|rB93mM?;=lMIIuh)*^|lVq@foMWP-R z-oU!%hu1eYQi2=(K8={VHb#p|eWLMLO1;!z){76r>@FoXOimV;X`@rxe!oy@tcfO$8fStIEphpM6t6ww}H29nM*OtbRc3 zrCPl=DQ_o3wvtguZmPrO;b1Iqk>*df^e9EMHV9#n-vOwAqN+l!hjIq_P z>9HCePJqF@QH2-bR_1wgR_%tOJoA^6yghheQD0?!1#lB$cPf%7^!v;528=hae&l*x zkLR|JX(@cY2r7$kRJJx(lkz}3YTJHg=i(5~OuG!azNE&-#p2h}m{XBsa-iGcc-zQW z`$gV!iH3#yba~GfoxaijvZa3Co#rAOw|jL4ueIJ?qZ5>1xK7gr7I1#eZf%`QJ`xd9 z9aSbdhs@IMBQGVDB#tsOpkjq`>~}W*a0w`ej7(;ZeSf@njQN)qfPMmGO0QYfd1ail zrKLrMKEx#QaHDtKcr1-Zz}0$TyDC*T=ECx@ViG^|X|Aq~-h3+Q?9lj-x3*QB=VX_% zxh^tIKXGJBE7D;;8Ph$S7Y^%Kg4o!gojW@Yl>R^g^vaC7uh8e8>l3HEualTP93wj# z$#b!_-h%|zLxhTQ!(1m^o=S9H0UuRG!?%?eF9)RRH?5RyW8hWu9a5>*g}HK<_ZGMD z7~=HxK_5R-%)s`c%Rco5c|^8XuodIW*Fv6r(}Tt!8OKuh_Q?mWuz8vvry3l`K9a)X zgXOs1aEOOU$pSVVY*p<~j7m2>dPTgW)ghYi3D)LHf`gTS=-^IOEERfa+@UGwJA5xe zz{ojjozBXVxWcmk62JY9Yq`;nkfgq+PC!w_-c<)-s$f6AW%NLXx;kLqmeI;d?;lf+i>uX&)rX2^$8hI0-gbGn+#BKBnEg#q6>WGKZ z<$fLuT}?!SkP|kR9bwXYU@uicF5s#NkXv8bPF!18A8o#}w>A`=vOf8U%e1Gd(6E~; zLytkLLqZ||bsq-5Ql0wJsw@Y?PT2u0!LhN?6c0r3)~X_5>S;bAMFT^>$*Qb7U^ANg zmKE5`x@sFH?v&VEOst3Kh(xd-?OA@N>@G|{47k8#9@Tjaj^h>%XeEQ*CLSi8RU%}Q z&_1oQEb-3US>e;wp{(^X3mSado{W;ibq^p|OZlT%fik@&&WWzB^jcO>n8Y$})i@t$z2x3ZzoXxDp8&&pyn$KkTu(&nK_ zD^~bB*9E`@*LrG?PoV2f!%5ffQChB_i}ZZF+-@XWVdRrIoOqC)XfhLOmJ$e+Io999 zgrsUjL!Oe&0>zKQfcr%w!ENg2PYxpk8hTQgSFV9L#xgT~-o1PtSMpK~uc>D6pWNEj zq|Hx0t$>6TWJC?LMNV62yd)$(8k-Os%R*hg{86(>HS~N}?~~ z2Tm6|_s+YW7!u5M_AQnb258&ykhW^MQ;CH}DfRv@893*|L$^}y^o$4C{PgHKw$#-``^7i1IRwd;~hyzNVFa%dB!zVbpO$p zUAQO(=-*Qd_vrR#YG|h^-hSEd;W`u8hHzE(ck6rC@QEBgH~T;9!vAM`e(^X1@$AW~ zqPhe5wBitm=)Mdx&j!WWr6qUuxw5JE5B~s8p!X50zGd4Ion_kn2TL26g;8T506J#O z`6T2|CW}5)6hLxYlk)Cl{ZYvXpi=ymy^0onTUv@gnnx%>>1?M-5#(>reQutQffy6z z?k0A*;se`g6^5qj`C9X{Cfy|!n^(QM<-H|H`8t;NoVenXVZTv`^<}-msY5J$nh&Av z_2lWKEC7?Zq`nFK6aNiZZWKW8TJepo|3*=we?!JbQ|nAghUsWODlf1I-A~7e1h+D0!>3_ZM(F@b;+f)1CoE zvt~>#avjaiYWkP_2}A*Kp0oj^aGc=JHqQ($a3XBg2Go1-;QXFdWWm`4~O`MSA8K#itogtjr3lq0Uj{qoOyj1J(vrkB1P$y>Pw+rC#Vd9?XM%vf2DjJT9& zjD=?Sv*G4G&FXAqUphIX1R;g<+!yl%nAy{^Z8&-P#3%JTO!qm)QP}83LxKM~^Z&Ud zM|W`efbtl`NcJ!{C|hJ_S~XBOi>&bI(n1&f{f>8WK<^0zJrRoQ570%Ar>3XbpOUr8 zD5Z+|S3?fRZ@xWTD*u)|5vjfme^Tu5S%aQ6*IbH!U`5nb4TTrcreO=4zNY&pTgOfO zAL!l+;M;Oe-|y_BJY1Q#9?blBZE-^G3~$Jr6AHl5oN{06VbU%zEo$3$`(}wQQ#7p< zlr=MJo61={hdrQ6=DV=>M7z)`ocf8kt}GcVM+n^7MSZT_COmt%LXJbX{DFEao@~bV zhAs4#_W8@eO%~WOtgigL*5fP;`}6A&1v1&EL}I@Lgp7&sqCxigmvM%m;ovHSHO{kc972NYR(2eOc=5u^HlVrafkp z7KQG|8rl_h6{9`*@cu@bMjU=8VHXiu0jG5v`SJZ=@N8FsZNksa%$ak{jAka^mj+M> z07vFgGX{Y!&-Wyh0o2p;mey!94hP>>onEV%py%TkHq##SR_xSoj{Ag#3EidFo5~l_ zHe&_zIvmq}ae}a13J(2S0IqZv=$?(vOKa|;NQ{`2owa#FM|{EIj=P{ZoaxL8`1IcY_N{_$TzgKDnm_R5>n3QfrLXw zk-JZIc&sLUZjs($_78a{>pH`-$*TVX6~)fvUEM%)kRAZwW91)6Va9`rxFkxM@6`84@hG>UEg>YM>24!t~>yH;$V)KFM{^7=Ij&aNP6`c1_oPfk`g zYO3MW!uZRdp9@`1U$Us9G(nnI`Y!rqz|MT*_<&D6<0xT~yUNaMS3j4QKF?6$S2~i5 zk``uc8iZH_acVmjCHf|ZFSrg%nd1ydD4*WwsBU}4dEnZh#N<#*_<$FB(C%oh6*76w zO#XE+Hv5X@0=6oZuPgBmE!(P%6-fS0ex>5fO|)6p>BUtPMWW z7H*v&D2*H^7Q}HZCCBl@i3UCs4!hyH5jt=Ih~j^E*7UM6iG?u6rLG;g#r zr*OEki@9!I+6zywe6=ik z&TR_034LO@A^=iaj4WAVwPPCMv}oM+6+E)P57I$J zv$5nxBNgojI4vM(F$dTQje%y=gh+={SxlGO4_$VZtoI5(0EoUOqfPB>zZ+5yEl}f+DNxw zF1ONcz{nxUR?bpo@bs9sEVEBG)PPA z92O>7Gw#BO~tAO1)uAyeauJ3{th_4W0#>uY-TtycoUm5wDu zCwF+PV?8n5u8-cXXFpX?5IM38Ccs_6SbEY9DlxRKJY$jUQccV@ zoe;@=Tzy*RtCGA`!8AHD`-Q1NY#%*L`pelR4JVqW*48*yb$A{DGOihPP<9|bBbT%E zRNywJfPe-$yB-rd@LE785oNduoy9X|=B3t!(6>!9se|K>;C#6tFAq^hCkUS4VvL)* zb%km5J@<3Aur@`h*Um%jDwk-UubZLsJNqHxQY|!?JKC54xMjq*B2!su*DvaWRZfC2 z8%S8{muQ6g?%>>Y`_p%P^-V>gm0cNGmHfMR#^C#7#Ir%`X03`@n$o8gK)#i5VsT?2 
z&q3vJFST2>Duvs2cf@+x_|Ybq@^spz=bJU|zT432tkdjNXv>V7s9^SN1`*9HM#^3G zW?qD0wR5r`>L65@Yclb02H|8eV>@G~Y?0V)^EHumWyjnNav)gfIq&S1Ga$6|lXk$X z&^~NfSRqlAxwHMKW`s+-UAs#6tLIT0VNZVH^sX{!!**|_)1>1Qlu1BDVM0I0{8=X~ zMJf(^;L+lAw_+Bq%()C_4;KwXoup#Q4ls*TqQ;YEbh0h?Buq~I4Rj`qbZDNOYdB8| zTmfK^34qHIB*ya>@~D!MAA%`mR!r)y^&WpOG+ShDl=5Oxr91}=!`cf?r3mfS8^-XM zB?@h7Y1+wS1{>nwtK>Xu%aqeOInWBHB}2TTp#G=OWxn+YFx3EjZFa@@#GuudiyWS~aDYu`IcJ_>qpAA?J-EWWUD;lx4x8GgO!NiKhVBT5| z7igjpkO8rS+ZSUaK@qJj5eiMZteP;sWRhR5jT^vmchvA4mK+wy=3u&iFo8^+Xi{yL zTi#pUg3cg?D`t?0`(R&ZPte53W*hHt&ewwt?0NYGZe%I{6Bw{u4lgbz)H*kol!WdIjVErzx@Hklw^wr!jMKrG})TyL!pO_8-4NK7jdWhxtq=@c1KblL!dm$cA5 zk+Iznq-F|b_!vRtQ{^$U0srZll5fW;mSQa01@7THo>=uNuOfnxHklUo!T9W;N59_i zKHR>uMiY^kaFP62-b!x0@74S2YmhFj!Gw^Y$Y_qswLDom;?L5ccg=$II7QY)wGdg^ zZmM&quYwSw$}w|l7sh)Ykuo(mKOg+=etj$(XS}a6Gu@|RRf=fQfoHHg9KLR%b|F5- zEzg7pO_$zCB`EWy)*GangnH%T+Bhg+krX?bl^P&{vD2WfD5+w*{YX+$yXW>JQ0%3ed0o+r5FC$ejRfy=^Pkg5$$3nU@F+WP-10XqQ>tcFeP3|d{$3o>8Cg? zG<1MET6ZX~s2fC*ZLUemW7wsUnYv|?;cH2s_=2eK9{%4W5zM^jJ$s*QNGG9O1#IUecp~tK-!nY``6XKXbN}F#*j#Hi5%Sjf@ z4#(I0PotLS&`W*MW^1KVl?3k6Zb+4_2A{ig`Ia;ZW7Bn_bBp}?OIu$}zP{PKG^E zq$NyxZWTk{%6p+%;Dab)HU$*X%notNnr$y7O~~& zHOPt)LuK|-G9 zkI0B5VKeL4726)zI(aj(W3sRqE@0SUsoKO_vNUY4^i_cDHo|?KLmLTAD$dYTj9Rn3 zwg)cv3LtVt+b{Ugf`QwUXi1daySeH!+pErSC@HH_OH_S$ket~}lHJ;l_~etd=j+Dt#z4U$;oYUUMornUn1|vtf~{@L3{{8YwfDEVR^b` zz47l_7-i8V??a9}m{q1>)uqU{vrsFW=u*O~udEA4t8-V0Bl?FDq==X4rAtZ*K|ycc z&MVL~n}-iDP0j0fhOe#$jDUu(#p_mER1lplg{-xnGCS_W-1vXyA^gw5w;jXZdQmg6 zrGCd`PYZ2uD{IvvOEZVjThdRc#AFq~5Se!N`qAzh8Cg~??OV053Dk#Lul80}kxS|X zw<3>o0i12-j6<;PgM34~=H4!&0C#U$cu7DBUCsWqnPBNRo6)pRfeG&~G&{Y!zmt_@ z3|PeO(h0G(J$Zw5DliRO4Vdi+CqgfQV!}+ezj`zpdy79!s#U0ssr5E>oV+|J_4g4g3)sXvg{T*~$Y(v)9 zoFun|@$PH!s_5>HBMaZPGclXiAJ{#Sz`U*D)1qvVQMmgbxz;=1;&tS~}Ei z>X4+@cj6~=($5qb6s!WLWoT+|w~<7D*;k<9bnzNmNt{hS7B&?TR6<2=I0(R>wOlAz zzmxR(aJxjp;2aKMz(eBxAI=U=r^9Kk-#SqEI9UiTG_Uvx-Gz|f*jb^jn*Kq}BX zvWpkc6d|T!`tHZM#V0$%4(8}qo+l?-=5}(&miULWOoQ=S4=NR_C!a;NhW@od#Y+}JEvd4rP-hS+{d|xlm+iL)tU!1Lv zZvGHmCHg?!NH2&+0CfT|cq`Ms z=ZgLdy@B^DQIQphmc@mLbqwT^MtK;$a3F{C0^M> z7YIHVMYCUr7zCI(3cz46E5VSoprvMEm(fhb86^bXTipRlPJVUfTieep*qODLa~$o- zB+Pd978#a%%`~r!VC~WgCe{;Pkl49hf2n>dz$K)@67=IWviY| zV5nXb5r46h)qsjY78^8kR2%tOWa}StZ~w6pd@C74nQ=}C$O`f6X$V7J7?Iv|Qz7Tj z&t2xc&y;G8q$2AS-v7|yxH}(qYCId;t{q#bA&)jt;MA>bGd~qRBwq9>2Sv6DT6J2d z5_AwcOgR7{TqC)uNcVDr*`AZx-Re+r1shB!0h%IOomJCzBr!oYQd`Ny?8t7wqX&TV zH#clMJ8b={%T||c9XNi4ZGA1+y4tURR4+Pb3T=1a;)l9f7?cF9l+q!4F`VNb$WoJO zVivGkRz_2X0jO7>2@3Vn8@NKHrhVMjlj+XBtwzThQHW7RU(CvZs@smoR#nV=c4^W+ zR**j|He%e_FngE8)VE|e}PP64bAq(`Q$NxpKNxaL!ardoMi<|08IvdJ^qj2{( zS5-yZdj_gYGVvD!owrYP(u&Z-K?I2k=z9u%9{ zLND!vWEl?Pa@t(YLjA^b$XdfUH&|8F^r0y*+CpQrDa)anOUBU)MRUO|ri80`DcvIZ zs)H*pSmZO+N)9H*(leZL71qq-?Z^5!9x9b&{%D@Xuj`5|>yS;R}jvb;OnD>2yW|HR$-q7yC5{D7NfNog6BBnk(+`-pqTGNVIxB zY!ga>2V92}V>Fge%^(&-9b-x1%G2Y~BAmfd93e=02w0b_Ee+M8YL*wH-I|8M@D{^5 z6Qi>&SHYE9H{D!TN1CBM=4j;5GNGIv1MSef?Vd~k&cHh>d+(VjW5%^@@0u&RVK?PH$e z%c88(CzLik`XVgKr61HdDbn*s26CsnMsuUAzy^`~iY7-zoRGND^-ERXi*RBwpbHzY zXoI2w=Y{988%c2iS!;7sGj}BN-q02vn^KiStzg%pZ;|_pU@0ia zfFdX?!(96eMp>`GbHZeMFz>$D!*RQiHV@2ofN{?!oKW%k?q<@X@a$}#%p_3e`em=g zaG1?{dfqCO%+V#car))j;z1_^kpT>_ri5fEi&dBpWQ}?p4FkAKk83v5Xl+zRj{P`I@noT zua6udOuNE{HQ3Mz;(xNqF~ic!92`2x?EQK_-CTBmAJrUvkX8ZchhxHu<~v#uq8y>? 
z-Y0eC4oyTkGpQ$4Im?xf_9eP{dR7x6aZQywf!V)$jn=xpnW?R^eSApMJ)d1(^Sev) zd&AuJpruN+s}C4JlWb0luZ?3{Ejf5h$KLI#688_TF7r0qdd7`ljGOut8FzMMKFUjn zo?S-tb+~lxvfLAyo2os1!@us;+@(s1k7aI_o%`T=#G==w=8Fs#-{LmMjNgZygpIpIK-!`^ zktxtV{$INRoaerk50t((ksgPwQegXGy{nuK7$E?-y{8x-<#i{@N$TL8YW_M6D%eXBi#8~=N-`=3f`8)hnnA}~L zL%*DH7KQhr!XvldT|f`)3@VzAO>kR0W4DZ^<76S;El6cmIm`zgd=LUi2IoYx|M6^; z5odgCEexj-`W8lS%Z`U#MZ;j%iQpz_Nb+4n1-yv=3+|5Gq*ML=UQ7djORX=OAdGNh zw^vl#%crtO5x)yxz0fMbK4)+SpohJA3o>5j1@#&lmh}u%g;NO3r&;K`>JR+tf&ac) zDa@fToRc1~v3~jl06@4>qL(raD->MdE!eOrQj*z0e7T)c2C2Ft;lzu-|CV!l&4&P> zNo~(#J}Wf)QWHY!Gd2GI8<xxJ7ECA{DxmUt6j z<#blwV*3luoTMUj1>|67HB@sq^}d{#0?FeO?K_a|>^~ZoeT_55%VAG}?YVx!ujb@G zE{`}F;A3%%{d3xX#L3yp@c}PIuA>p0PGp1sY$w1X4yNJH)})$^iC5HU+{J=Bv*)tszHk4 ze#opJ&5VCEELU{FzKV{ZN%!MTJ z4z)8nRwWyt%u#u?l>eLJ)aP<|04I)v@Ww_H_~LoD`){B5deQ^@91+*gTKE2BIbQ(w z&L-{e8gl+B0a!)=FO*oHG3vwb4uo?LWuH5Ee{ZhD^hx(1!_y?tpSt>B^5wExyeQhT zt;>*Ged1Iao)5-8IzwI6u$~@lW7tv1Yr&%2_#s;bXHUl`2=*j6K&tV_k3mK5W##3cH18@d3kWiIKPm`|kI(kM z^*~9?v$Cp?JHGnw4a7f(m#+Z0U1BQt&H%|;24^6q7#R;(T%c+>=b1Bwa<80th`{n#poOS}C};!;S~tN|;#*Yi^& zSC8Pf8J?%hNn4wO45@%)KsmnPsaE{%y_9UjmWc$u`FJxE>pIx1lMdiT3%b0QZvK|6 za{p_@=6ZVNUNqa!Y}Vf>aGtlEZOQPh%YBepuVJtQmsxj8KDV@vAu~~z3Jv))qv{NU z_Trg$)18bf+cv?D;%_w^8uj8?&26TdS-A0gO?V$Q5N~0Kk+8DC-jE5-5MkGl((>{& z^W?uJz`&$btYprzJ}ABZq4apjQ>o{stsN&l*&1P2&d<}tusza`i#wIr{y__NeG{Pu zq=en)6l4Cq-c5_sucH@zJsHJnCdewa?f?twrj2|r_l{?2QD3%(L`S?J=S0V`-idaB z!qgP0@znC$yh);K?ee}eGsfD*jwKO1g#)P_U0>psO9ysQi-*nwF33egQ6L1DGS9HP zwBik^nBlD6moW9lB$rlH$N;Ic)dQ_&-v9e0_?y8!#^EbR$@D%x^1%H? zT}{fNSI$`1q|lu_k@Qe^sXx=Sz3-bm_agTOk9L)JR`$Qt9i|!A30BZh>3fQaQ2^xx zRS{76BXV}ftKxL>CQTpg&2jafVfHOf4Yj-!w{Z8Kt*C>o*Ap+fj*Ixup!(F;pMPkU z%KDufpFeNS`1o*Nq%e!@GvWBk71vPT(3*fK0n)=%8U95blYy@dh3^~hG?U^0xjh5R zr6q24LIsHy1s~$8Z4j&1x?2~wk(YA}MIF3+XJzFO9wW=GK%RL!ae_lvxmQ#8!{rW1_00cAd*>O|)VB3~3l^k_3P@K`QL4xRqy!X>q97pB z2@vTDgx*Ucir^sv7MjvT1f+%@2#_FEX+fk1h=70)AfYCtzB%vnT<^K}IhS*uG2U-4 zUmR<=LiXOj%(doPbN}aDXKxneafygsJkNTQCeEHj{A4n1(U6)b3)G{9#{ue0l30qd zxqnxT$q$xs$`2xYb#-BBMQJCb-}od~WQ6f7)-AFmWB$GWUA#dWr%M~(zHsaE-BBo&-%a_$Vx#7++)m0kvsPrORU;HMEqPK|NXg$qTORwG$xPfB4reL#Y{!6h4#r@yp>>9pT0o0R>I8QpWC5Lp5SRDG) zH+)|I$Lh_PDT$bkBv#PZ`vcXrO-)d7LBg?n{l-zJr`?(LP1%ixJ@*7v)}DBP(L!!) 
z+l_!t#nCD-qQyq3(EMde7PP;hS}4cH>}pzs@kd|QB<@qoOlrt;TY7H%s(*aG4FXv9 zN?Y0f*wRsjZ?RoDAy9u$UFqs}(c5~S^C+puH*P0>z2Ti9^;I(Nxth|W&5*!M6P`QtpuxWX50oO9y?`j{gq3r5-gUckO_xXvC*%&CMrkQf5`ivvyBkD7KF&O3l%yk71n0hST$%UP$?_-gQW( zrp+uR>}_+?Gqw3OuXM|CjU@=))}Ef5m2wTz1(9+=@M`jql+0@W@pH-=r%fI=M`lhN zkL&cV#3k#ms@#y#x%$HQZfjALh(@^%OAMTv9a;ZkN#=v8sH(qjN-OAT&=26J7JVtQ zSxSp!kiN~knfXULx-;5sj}zs2Wl9f>e$Dv2bh_7fV_kBOV17E8N_`nDKa)~8h@D5aqAK9_+Yt?^hHoj)r)gJa8`MOw!=$Q%khbc zT-&1a4?LAv#UReDeSc732Ds_^9u^;;^m|ZRS)yf{Rj9&gRM*+gv{!26&H&sT+c1m@J{iD4WV4Q~AIF%wM zMKo~UCu8qKGdU-TsdL7PEod#tVgYGN&kqK;owO2pwGn19${8B)==b+t8DHS9?OEXu zR**=hcOQf0X^Q3_U@81~+^bUWHR*f;aFcK*r@}pJnaS>HIULg7^{l=BPt@iAwk7@^ z{V-owVPTwTaBF~=Z(Xs(&IBeC-sAfY zyJlnez5_Pj0%IF##D!cNALAuCi=E@Sa9a3~brz<#lfPr_X!5T*Zn*tHl|iCAo7Z zk;gXfTPZLW7G8#*wo-0v1jRFw0tK26!cAW^akuxzuFRo*UVPB;sc<72{-*vSxcXM< zcDNLIxj-2%$Tasw{-7-83V%{uZWqdWFiEkd>(w5WR!oV1xtxtJQ44stLj7JMk!znD7%Eoj z10tW_zW2H4#!-8{xA$1E*GI(FheAwO+y`{!&ec^Hn1ZYF*4IK2 z5HpQ&jgsslZT`*_>B*YXo>%9S#yV$YV^lZdVzne*F@vu)_lhah&*B|R57G*{?@OJ+ zw*=i#Wz+d2*z%8B0MQv55jR-cF;3?!17Qhgy_EhB6Kd5dGrW?sA0-cV6|1g)XXU=; zjrN;5rymC2%NJ7a5Q!U?(W4t1aQw6Aga%X!WwR)M0}I)Pg4XYm*y5=HoE*iwK_qGS7Eh z5`H%2fd2dW7Lv6|#Okt_+6dwGB+q9@o^i)P9Bw->BhCxP`=MqD&2{5prsG?OK#&RF zw}MF^t3X2v0u*L^M38XEj3Pv0kw~Wf!t}yVE@a49p{&sf-OnO=DHq}L&rUTASV{P; zr7NP}6nU*qD<;4??>$JS5*lrzg{_#0q~myM`CMJ zeT{Z8i;rh*KUPMA?yyc_|Mk!#a~Y7bF3IKSUxp35We?ccjOT{IUQf@xQ4@Rn&{TTj z+PKY;&X>!_H;;|ST~#q_(1^u5E`oSem^Y6I9Jy&`d$p4?l%fj^KT~O;XX@5Vyr}{{y)|8SzZ=?nO{5^EK+k>#RiJMSzudn*G^%!|7iDmO zF=3s97E!J1{j?P@msy2X&D|`SO5Ou3xLOHZvhfsh!5;?5zb&(nr13* zhxwEkRKnf1!XwSMuY6euWDBo0U6?uvd;4fxDE`BlRmtPCt?hn8CMwYU=tQ@>-AHAl zBNj4eW%CDKwk5cT-FA!rEZt$DhIxJ!gk5>H<$gCnVuh+59l?B98W?bkeQ%NV`E!t8 z56}zx%N&G5kR`kXzin#^A@mVs*exbj^0Pn}E^&}*@(4n$5tZ1Ks=8cn)p1YazPESB z#kD!Fiyx~;7u*e;f~M!(sNH6mzFJ>B&bY;QLn;mH`8OI;fp4l+;Qhv1qPR26Cp7z> zq03}Xi4m_40pQFNTQYJID|DNK5!lhwmR~$DULA>j7#r)5EIU@#1GNC_$&$_zCjVPH zE0Xl0`2V?dwvcDi-}Q6DKg6@GI_>;bQZt!jy_R5omZq&3$||=}I066FCxm8?1&PsC zNwEVu>2@(oTE%!moym1htR|=FlOvk=i3w$kPmzfiMlMNQx#N7ybZ_QF+#Vh3yM0nw zknyd^^?5Dn<+S4UrG`ATjAd~LfQ~*?%-S=(bIl6yYDhLq|QekcEhjl2v6M4&&50e8l{4VD499dl=~`^evfP`t1y+^wnd-&^fEKe`HMu_uNg(t zw!I_M!#`eS>6Jl6rAOqKx~~b+3|s}b9fL*%?5RP(?yjc=M*^RHR>)$3Ehk7zRN@F} zTEk`uO5(-)=S&aYzp^~HSUQZF84+qs&aH=3 zxYSjyI1bgR)uC@`MPvjk*tOHf^zAagURM^WI(Nj+$0vy@j2#m_INJiV;Ub$vT9jmX zytb});d{L{*Y)60)@>u^T?plu&cnax*$$)!;U?7rs@amah68wCED*xP#eB5wLcuRr zNMn5vR-~@?3>qQ07Y4`-xlZ8>)j4lSwQ)@|d5tyQGczc~I=IC9PUoQ{RrrzGdRE?1 zM9BE(O65X=M$6l6wW#P(hEM8xZV>0;vIBA}BVdl+Dnfn*vXPZw*V}Y|h@i6XhiP3= zgnK?vs%Y4TkIy3o=|oa8`xxp5n;07e`;B2NSYd=WzkF~6CC!Dm2?5ziB|XCV4=Ulo z19JM=R#~s+O@aJHzVqM`kw=m}s?ugM`9?b2BadVTujA+9{4+W~-3OVzEjSVdcJ|;% zpXg3xIjCNGzzYakOUt97nu}D0h|#|>%DX(=t@@X4>N$3w***`S1FJU<*7MzX{`yEL zt14pYgeyn1F>ok(6a3uG>hl_wlHxSMu<|8;7n&F<>zAN>cr*yHrapRcQA+Qa+>I`I z{6WEM0RsNOiM5M+Jy9u$0W-SFQkJZMT$^BrSAdHaU$DieoDTt*rH$%?xqk14)~M0; z?`rBM_>gzFv&u?90SQ;oMng+XZPVuy;=su=mkW!IRIkCp=J05?neE&1)pVk-qp*XS zX>{XSdvbeWqLjIT_#ixhivHzx@8`&;v+bB8Ip;@zxU=C$&hcMXFRBX^{+Qw>x~_t9 zGuLO&(i;kKk(4^MPBo)DVZY$cuF)pDk@Hk4dRg}~yUE;|aH_MN)v#nSq{K$BM(ZB7GnEv!g3J*WpY2YlrMlG@?^)$8P z&fZ89ey$fu?SZKLXK(8$v%ueiu3;;g^;B2sSGckL%uiWZ)E6DI^Nr_ID)HNbAH!YF zH^s%IEP}lg(z##MxyO}Y*~mBk~Ma=d@He2TDoOJuu^;cAG8WEhs$nZbZ<0woU{3DI;C0(ssmHGTSZNX- zo8K^8?LJH#`^|v4CqZByRrR((#cg;BELSc3IJTY}mrHzOTjS2q1!#S(_PoyMkWO{0mU^oC!V3zdVlUGZd77YiX>TLdQ;BeYj`uKDNV0|E%%$)d3--*=07L1IW1!#p_R(n|$)4O52oCB{~m^hUMg& z?;CJbw<5*3lKQ&S+n@y{&-mb8ExcEaUh} z@FLb3TePAfom_oo3>(-?bZj9;$_k)fR}&+NQD zf_wSnf}RV&@?_mt&niMlcSmL#CA!=rVoThBIIl0Yk3m&p(!(}$IP7H}MX$x_KCf`N zgZsjncARbAHl66L!ZGh 
[... base85-encoded binary image data omitted (tail of docs/pictures/ndtimeline_trace.png) ...]

diff --git a/docs/pictures/pp.png b/docs/pictures/pp.png
new file mode 100644
index 0000000000000000000000000000000000000000..edabe7fb8a670f81cb3ecb2a199311e4bb34dec5
GIT binary patch
literal 84478
[... base85-encoded binary image data omitted (docs/pictures/pp.png, 84478 bytes) ...]

diff --git a/examples/open_llama_4D_benchmark/download_open_llama_ckpt.py b/examples/open_llama_4D_benchmark/download_open_llama_ckpt.py
index 876228a..5fcf32e 100644
--- a/examples/open_llama_4D_benchmark/download_open_llama_ckpt.py
+++ b/examples/open_llama_4D_benchmark/download_open_llama_ckpt.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/examples/open_llama_4D_benchmark/llama_mfu_calculator.py b/examples/open_llama_4D_benchmark/llama_mfu_calculator.py
index 9bacdd5..b67908d 100644
--- a/examples/open_llama_4D_benchmark/llama_mfu_calculator.py
+++ b/examples/open_llama_4D_benchmark/llama_mfu_calculator.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at diff --git a/examples/open_llama_4D_benchmark/run_open_llama_w_vescale.py b/examples/open_llama_4D_benchmark/run_open_llama_w_vescale.py index 8117551..22f7cf8 100644 --- a/examples/open_llama_4D_benchmark/run_open_llama_w_vescale.py +++ b/examples/open_llama_4D_benchmark/run_open_llama_w_vescale.py @@ -1,6 +1,6 @@ ################################################################################ # -# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/examples/open_llama_4D_benchmark/sharding_plan.py b/examples/open_llama_4D_benchmark/sharding_plan.py index 12bcd65..2aefd81 100644 --- a/examples/open_llama_4D_benchmark/sharding_plan.py +++ b/examples/open_llama_4D_benchmark/sharding_plan.py @@ -1,6 +1,6 @@ ################################################################################ # -# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/requirements.txt b/requirements.txt index 5e4d40f..82b132a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ pytest tqdm optree accelerate -transformers==4.37.2 +transformers==4.40.2 flash_attn +matplotlib mmh3 \ No newline at end of file diff --git a/test/checkpoint/nano_gpt/test_nano_gpt_load_save.py b/test/checkpoint/nano_gpt/test_nano_gpt_load_save.py index 3c487b5..5e1e8aa 100644 --- a/test/checkpoint/nano_gpt/test_nano_gpt_load_save.py +++ b/test/checkpoint/nano_gpt/test_nano_gpt_load_save.py @@ -101,9 +101,7 @@ def init_method(self): @skip_unless_torch_gpu @with_comms def test_load(self): - ddp_gpt, dist_optimizer, _ = build_gpt_model_optimizer_and_dataset( - self.init_method, dp_size=2, tp_size=2 - ) + ddp_gpt, dist_optimizer, _ = build_gpt_model_optimizer_and_dataset(self.init_method, dp_size=2, tp_size=2) # Load the model and optimizer after first data diff --git a/test/checkpoint/open_llama/test_open_llama_dp_reshard.py b/test/checkpoint/open_llama/test_open_llama_dp_reshard.py index b1f6cb3..370dadd 100644 --- a/test/checkpoint/open_llama/test_open_llama_dp_reshard.py +++ b/test/checkpoint/open_llama/test_open_llama_dp_reshard.py @@ -1,6 +1,6 @@ ################################################################################ # -# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/test/checkpoint/open_llama/test_open_llama_load_save.py b/test/checkpoint/open_llama/test_open_llama_load_save.py index 0a3a29a..c0a8377 100644 --- a/test/checkpoint/open_llama/test_open_llama_load_save.py +++ b/test/checkpoint/open_llama/test_open_llama_load_save.py @@ -1,6 +1,6 @@ ################################################################################ # -# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/test/checkpoint/open_llama/test_open_llama_tp_reshard.py b/test/checkpoint/open_llama/test_open_llama_tp_reshard.py index 5096062..2a85cae 100644 --- a/test/checkpoint/open_llama/test_open_llama_tp_reshard.py +++ b/test/checkpoint/open_llama/test_open_llama_tp_reshard.py @@ -1,6 +1,6 @@ ################################################################################ # -# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/test/model/open_llama/test_attention.py b/test/model/open_llama/test_attention.py index f014531..ad7c281 100644 --- a/test/model/open_llama/test_attention.py +++ b/test/model/open_llama/test_attention.py @@ -56,7 +56,8 @@ def test_attention(self): input.retain_grad() non_parallel_attention, _ = get_model() non_parallel_attention = non_parallel_attention.cuda() - golden_outputs = non_parallel_attention(input) + dummy_position_ids = torch.randint(low=0, high=s, size=(bsz, s)).cuda() + golden_outputs = non_parallel_attention(input, position_ids=dummy_position_ids) golden_loss = golden_outputs[0].mean() golden_loss.backward() @@ -84,8 +85,9 @@ def test_attention(self): d_input = distribute_tensor(input.detach(), device_mesh, [Shard(1)]) d_input.requires_grad_() d_input.retain_grad() + d_position_id = distribute_tensor(dummy_position_ids.detach(), device_mesh, [Replicate()]) - vescale_outputs = vescale_attention(d_input) + vescale_outputs = vescale_attention(d_input, position_ids=d_position_id) vescale_outputs[0] = vescale_outputs[0].redistribute(placements=[Replicate()] * device_mesh.ndim) vescale_loss = vescale_outputs[0].mean() diff --git a/test/model/open_llama/test_decoder_layer.py b/test/model/open_llama/test_decoder_layer.py index c55ac9a..b32292c 100644 --- a/test/model/open_llama/test_decoder_layer.py +++ b/test/model/open_llama/test_decoder_layer.py @@ -56,7 +56,8 @@ def test_decoder(self): input.retain_grad() non_parallel_decoder, _ = get_model() non_parallel_decoder = non_parallel_decoder.cuda() - golden_outputs = non_parallel_decoder(input) + dummy_position_id = torch.randint(low=0, high=s, size=(bsz, s)).cuda() + golden_outputs = non_parallel_decoder(input, position_ids=dummy_position_id) golden_loss = golden_outputs[0].mean() golden_loss.backward() @@ -95,8 +96,9 @@ def test_decoder(self): d_input = distribute_tensor(input.detach(), device_mesh, [Shard(1)]) d_input.requires_grad_() d_input.retain_grad() + d_position_id = distribute_tensor(dummy_position_id.detach(), device_mesh, [Replicate()]) - vescale_outputs = vescale_decoder(d_input) + vescale_outputs = vescale_decoder(d_input, position_ids=d_position_id) vescale_outputs[0] = vescale_outputs[0].redistribute(placements=[Replicate()] * device_mesh.ndim) vescale_loss = vescale_outputs[0].mean() diff --git a/test/ndtimeline/__init__.py b/test/ndtimeline/__init__.py new file mode 100644 index 0000000..98f6b56 --- /dev/null +++ b/test/ndtimeline/__init__.py @@ -0,0 +1 @@ +# make pylint happy diff --git a/test/ndtimeline/test_local_raw_handler.py 
b/test/ndtimeline/test_local_raw_handler.py new file mode 100644 index 0000000..28253c0 --- /dev/null +++ b/test/ndtimeline/test_local_raw_handler.py @@ -0,0 +1,37 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import os +from vescale.ndtimeline.world_info import WorldInfo +from vescale.ndtimeline.handlers import LocalRawNDHandler +from vescale.ndtimeline.variables import LOCAL_LOGGING_PATH + + +def test_basic_usage(): + h = LocalRawNDHandler(run_id=0, chunk_sz=10, backup_cnt=3) + file_name = "timeline_run0_raw.log" + h("test_metric", 1.0, [1.0], [1.0], [{}], range(0, 1), WorldInfo(0, 0), {}) + assert os.path.exists(os.path.join(LOCAL_LOGGING_PATH, file_name)) + for _ in range(4): + h("test_metric", 1.0, [1.0], [1.0], [{}], range(0, 1), WorldInfo(0, 0), {}) + h("test_metric2", 2.0, [1.0], [1.0], [{}], range(0, 1), WorldInfo(0, 0), {}) + assert os.path.exists(os.path.join(LOCAL_LOGGING_PATH, file_name + ".2")) + assert not os.path.exists(os.path.join(LOCAL_LOGGING_PATH, file_name + ".4")) + os.remove(os.path.join(LOCAL_LOGGING_PATH, file_name)) + for i in range(1, 4): + os.remove(os.path.join(LOCAL_LOGGING_PATH, file_name + "." + str(i))) + assert not os.path.exists(os.path.join(LOCAL_LOGGING_PATH, file_name + ".2")) diff --git a/test/ndtimeline/test_metric_level.py b/test/ndtimeline/test_metric_level.py new file mode 100644 index 0000000..96f755c --- /dev/null +++ b/test/ndtimeline/test_metric_level.py @@ -0,0 +1,30 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +from vescale.ndtimeline import NDMetricLevel + + +def test_cmp_level(): + assert NDMetricLevel.FRAMEWORK_DEBUG >= NDMetricLevel.INFO + assert NDMetricLevel.USER_DEBUG >= NDMetricLevel.INFO + assert NDMetricLevel.USER_DEBUG > NDMetricLevel.INFO + assert NDMetricLevel.USER_INFO < NDMetricLevel.INFO + assert NDMetricLevel.USER_INFO <= NDMetricLevel.INFO + assert NDMetricLevel.INFO < NDMetricLevel.DEBUG + assert NDMetricLevel.TRACE <= NDMetricLevel.TRACE + assert NDMetricLevel.TRACE >= NDMetricLevel.TRACE + assert NDMetricLevel.TRACE == NDMetricLevel.TRACE diff --git a/test/ndtimeline/test_parser_handler.py b/test/ndtimeline/test_parser_handler.py new file mode 100644 index 0000000..b745ccf --- /dev/null +++ b/test/ndtimeline/test_parser_handler.py @@ -0,0 +1,61 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import pytest +from vescale.ndtimeline.world_info import WorldInfo +from vescale.ndtimeline.handlers import ParserNDHandler +from vescale.ndtimeline.exceptions import NDHandlerError + + +def test_normal_input_with_tags(): + metric_name = "test_metric" + recent_elapsed_raw_parts = [1.0, 3.2, 1.4] + elapsed = sum(recent_elapsed_raw_parts) + recent_since_start_raw_parts = [1710332816.6118143, 1710332833.2222, 1710332846.1313] + single_tag = {"is_test": True} + tags = [single_tag] * (len(recent_elapsed_raw_parts) - 1) + [{"is_test": False}] + step_range = range(0, 1) + world_info = WorldInfo(0, 0) + callback = ParserNDHandler() + records = callback( + metric_name, elapsed, recent_elapsed_raw_parts, recent_since_start_raw_parts, tags, step_range, world_info, {} + ) + assert len(records) == 1 + assert records[0].step == 0 + + +def test_normal_invalid_input(): + metric_name = "test_metric" + recent_elapsed_raw_parts = [1.0, 3.2, 1.4] + elapsed = sum(recent_elapsed_raw_parts) + recent_since_start_raw_parts = [1710332816.6118143, 1710332846.1313] + single_tag = {"is_test": True} + tags = [single_tag] * (len(recent_elapsed_raw_parts) - 1) + [{"is_test": False}] + step_range = range(0, 1) + world_info = WorldInfo(0, 0) + callback = ParserNDHandler() + with pytest.raises(NDHandlerError): + callback( + metric_name, + elapsed, + recent_elapsed_raw_parts, + recent_since_start_raw_parts, + tags, + step_range, + world_info, + {}, + ) diff --git a/test/parallel/pipeline/api/four_mlp.py b/test/parallel/pipeline/api/four_mlp.py new file mode 100644 index 0000000..44d2d49 --- /dev/null +++ b/test/parallel/pipeline/api/four_mlp.py @@ -0,0 +1,53 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import torch +import torch.nn as nn +import os + + +class MLP(nn.Module): + def __init__(self, features_in, feature_middle, features_out, value): + super().__init__() + self.value = value + self.counter = 0 + self.fc1 = nn.Linear(1024, 1024, bias=False) + self.fc1.weight.data.fill_(value) + self.fc2 = nn.Linear(1024, 1024, bias=False) + self.fc2.weight.data.fill_(value * 2) + self.gelu = nn.GELU() + + def forward(self, x): + t = self.fc1(x) + t = self.gelu(t) + t = self.fc2(t) + torch.save(t, f"{os.environ['model_name']}_mlp{self.value}_fwd{self.counter}_out_tensor.pt") + self.counter += 1 + return t + + +class FourMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden * 1, hidden * 2, hidden * 3, 0) + self.mlp2 = MLP(hidden * 3, hidden * 4, hidden * 5, 1) + self.mlp3 = MLP(hidden * 5, hidden * 6, hidden * 7, 2) + self.mlp4 = MLP(hidden * 7, hidden * 8, hidden * 9, 3) + self.sequence = nn.Sequential(self.mlp1, self.mlp2, self.mlp3, self.mlp4) + + def forward(self, x): + return self.sequence(x) diff --git a/test/parallel/pipeline/api/test_pipe_engine_api.py b/test/parallel/pipeline/api/test_pipe_engine_api.py new file mode 100644 index 0000000..5075bd5 --- /dev/null +++ b/test/parallel/pipeline/api/test_pipe_engine_api.py @@ -0,0 +1,417 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +import os +from common_dtensor import DTensorTestBase, with_comms +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.optim.base_optimizer import BasicOptimizer +from vescale.pipe.pipe_stage import PipeModule, construct_stage_modules +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +from vescale.engine import PipeEngine +from vescale.plan import ( + PipelineParallelPlan, + PipelineScheduleType, + ModeType, + PipelineSplitMethodType, +) + + +class MLP(nn.Module): + def __init__(self, n_features): + super().__init__() + self.fc1 = nn.Linear(n_features, n_features * 2, bias=False) + torch.nn.init.uniform_(self.fc1.weight, 0, 1) + self.fc2 = nn.Linear(n_features * 2, n_features) + torch.nn.init.uniform_(self.fc2.weight, 0, 1) + self.gelu = nn.GELU() + + def forward(self, x): + t = self.fc1(x) + t = self.gelu(t) + t = self.fc2(t) + return t + + +class FourMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden) + self.mlp2 = MLP(hidden) + self.mlp3 = MLP(hidden) + self.mlp4 = MLP(hidden) + self.sequence = nn.Sequential(self.mlp1, self.mlp2, self.mlp3, self.mlp4) + + def forward(self, x): + return self.sequence(x) + + +class EightMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden) + self.mlp2 = MLP(hidden) + self.mlp3 = MLP(hidden) + self.mlp4 = MLP(hidden) + self.mlp5 = MLP(hidden) + self.mlp6 = MLP(hidden) + self.mlp7 = MLP(hidden) + self.mlp8 = MLP(hidden) + + def forward(self, x): + x = self.mlp1(x) + x.retain_grad() + x = self.mlp2(x) + x.retain_grad() + x = self.mlp3(x) + x.retain_grad() + x = self.mlp4(x) + x.retain_grad() + x = self.mlp5(x) + x.retain_grad() + x = self.mlp6(x) + x.retain_grad() + x = self.mlp7(x) + x.retain_grad() + x = self.mlp8(x) + return x + + +class ScheduleTest(DTensorTestBase): + @property + def world_size(self): + return 4 + + @staticmethod + def loss_fn(x): + return torch.sum(x) + + def _prepare_runtime_engine(self, model, forward_only: bool = False): + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.MANUAL, + num_stages=4, + virtual_chunks=1, + smallest_unsplittable_units=["mlp1", "mlp2", "mlp3", "mlp4"], + split_points=["mlp1", "mlp2", "mlp3", "mlp4"], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + forward_only=forward_only, + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=["PP", "DP", "TP"], + ) + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + pipe_config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + _parameters = list(stage_modules[0].parameters()) + optimizer = torch.optim.SGD(_parameters, **optimizer_fn_kwargs) + basic_optimizer = BasicOptimizer(optimizer, models=stage_modules) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pipe_config) + engine = PipeEngine( + pipe_module, + VESCALE_DEVICE_MESH, + self.loss_fn, + pipe_config, + ) + + return engine, optimizer + + def _prepare_runtime_interleaved_engine(self, model, forward_only: bool = False): + num_layer = 8 + pipe_config = 
PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.MANUAL, + num_stages=4, + virtual_chunks=2, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layer)], + split_points=["mlp2", "mlp4", "mlp6", "mlp8"], + batch_p2p_comm=True, + overlap_p2p_comm=False, + schedule_type=PipelineScheduleType.INTERLEAVED_1F1B, + forward_only=forward_only, + ) + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=["PP", "DP", "TP"], + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + pipe_config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + _parameters = list(stage_modules[0].parameters()) + list(stage_modules[1].parameters()) + optimizer = torch.optim.SGD(_parameters, **optimizer_fn_kwargs) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pipe_config) + engine = PipeEngine( + pipe_module, + VESCALE_DEVICE_MESH, + self.loss_fn, + pipe_config, + ) + return engine, optimizer + + @with_comms + def test_runtime_engine(self): + """ + Tests pipeline engine. + """ + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + n_hidden = 3 + batches = 8 + model = FourMLP(n_hidden).cuda() + + all_batches_out = [] + if local_rank == 3: + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(3) + model.cuda(3) + out = model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + print(loss) + print(" ====================================== ") + + engine, optimizer = self._prepare_runtime_engine(model) + + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append(data.to(device)) + + minibatch_loss, _ = engine(data_iterator) + + if local_rank == 3: + self.assertEqual(minibatch_loss, sum(all_batches_out)) + + @with_comms + def test_simple_inference_schedule(self): + """ + Tests pipeline engine's inference mode. 
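+        Runs a golden FourMLP on the last rank and checks that the forward-only
+        engine reproduces the same minibatch loss.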
+ """ + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + n_hidden = 3 + batches = 8 + model = FourMLP(n_hidden).cuda() + + all_batches_out = [] + if local_rank == 3: + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(3) + model.cuda(3) + out = model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + print(loss) + print(" ====================================== ") + + engine, optimizer = self._prepare_runtime_engine(model, forward_only=True) + + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append(data.to(device)) + + minibatch_loss, _ = engine(data_iterator) + + if local_rank == 3: + self.assertEqual(minibatch_loss, sum(all_batches_out)) + + @with_comms + def test_runtime_interleaved_1f1b_engine_batch(self): + """ + Tests pipeline engine with interleaved 1f1b schedule under + batch p2p communication. + """ + global local_rank + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + n_hidden = 3 + batches = 8 + model = EightMLP(n_hidden).cuda() + single_model_data = [] + all_batches_out = [] + if local_rank == 3: + true_model = model + true_model = true_model.cuda() + true_model.train() + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i % 8 + data = data.float().cuda(3) + single_model_data.append(data) + out = true_model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + print(" ====================================== ") + + pipe_engine, optimizer = self._prepare_runtime_interleaved_engine(model) + + data_iterator = [] + for j in range(batches): + data = torch.zeros(1, 1, n_hidden) + j + data_iterator.append(data.to(device)) + + minibatch_loss, _ = pipe_engine(data_iterator) + + if local_rank == 3: + ground_truth_loss = sum(all_batches_out) + self.assertEqual(minibatch_loss, ground_truth_loss) + + @with_comms + def test_runtime_interleaved_1f1b_engine_p2p(self): + """ + Tests pipeline engine with interleaved 1f1b schedule under + overlapped p2p communication. 
+ """ + global local_rank + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + n_hidden = 3 + batches = 8 + model = EightMLP(n_hidden).cuda() + single_model_data = [] + all_batches_out = [] + if local_rank == 3: + true_model = model + true_model.train() + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i % 8 # + i + data = data.float().cuda(3) + single_model_data.append(data) + out = true_model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + print(" ====================================== ") + + num_layer = 8 + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.MANUAL, + num_stages=4, + virtual_chunks=2, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layer)], + split_points=["mlp2", "mlp4", "mlp6", "mlp8"], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.INTERLEAVED_1F1B, + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=["PP", "DP", "TP"], + ) + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + pipe_config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + _parameters = list(stage_modules[0].parameters()) + list(stage_modules[1].parameters()) + optimizer = torch.optim.SGD(_parameters, **optimizer_fn_kwargs) + basic_optimizer = BasicOptimizer(optimizer, models=stage_modules) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pipe_config) + engine = PipeEngine( + pipe_module, + VESCALE_DEVICE_MESH, + self.loss_fn, + pipe_config, + ) + + data_iterator = [] + for j in range(batches): + data = torch.zeros(1, 1, n_hidden) + j + data_iterator.append(data.to(device)) + + minibatch_loss, _ = engine.forward_backward(data_iterator) + + if local_rank == 3: + ground_truth_loss = sum(all_batches_out) + self.assertEqual(minibatch_loss, ground_truth_loss) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/api/test_pipe_single_stage_ops.py b/test/parallel/pipeline/api/test_pipe_single_stage_ops.py new file mode 100644 index 0000000..9d0922b --- /dev/null +++ b/test/parallel/pipeline/api/test_pipe_single_stage_ops.py @@ -0,0 +1,219 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+import os
+import torch
+import torch.nn as nn
+from torch.testing._internal.common_utils import run_tests
+from vescale.devicemesh_api import VESCALE_DEVICE_MESH
+from vescale.plan import PipelineScheduleType, PipelineParallelPlan, ModeType, PipelineSplitMethodType
+from vescale.pipe.pipe_stage import PipeModule, construct_stage_modules
+from vescale.engine import PipeEngine
+from common_dtensor import DTensorTestBase, with_comms
+from torch.optim import SGD
+
+microbatch_size = 16
+factor = 8
+batch_size = microbatch_size * factor
+RANDOM_SEED = 9999
+
+
+class MLP(nn.Module):
+    def __init__(self, value):
+        super().__init__()
+        self.value = value
+        self.counter = 0
+        self.fc1 = nn.Linear(32, 32, bias=False)
+        self.fc1.weight.data.fill_(value)
+        self.fc2 = nn.Linear(32, 32, bias=False)
+        self.fc2.weight.data.fill_(value * 2)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        t = self.fc1(x)
+        t = self.gelu(t)
+        t = self.fc2(t)
+        torch.save(t, f"{os.environ['model_name']}_mlp{self.value}_fwd{self.counter}_out_tensor.pt")
+        self.counter += 1
+        return t
+
+
+class MLPWithForwardUtil(nn.Module):
+    def __init__(self, value):
+        super().__init__()
+        self.value = value
+        self.counter = 0
+        self.fc1 = nn.Linear(32, 32, bias=False)
+        self.fc1.weight.data.fill_(value)
+        self.fc2 = nn.Linear(32, 32, bias=False)
+        self.fc2.weight.data.fill_(value * 2)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        t = self.fc1(x)
+        t = self.gelu(t)
+        t = self.fc2(t)
+        torch.save(t, f"{os.environ['model_name']}_mlp{self.value}_fwd{self.counter}_out_tensor.pt")
+        self.counter += 1
+        return t
+
+    def forward_util(self, p2p_input, local_input=None):
+        print("This is an auxiliary forward_util() provided by the user")
+        if p2p_input is not None:
+            print("Modified p2p_input value!")
+            p2p_input *= 2
+        else:
+            print("Load local input as p2p input")
+            p2p_input = local_input
+        if local_input is not None:
+            print("Handling local inputs")
+        return [p2p_input]
+
+
+class EightMLP(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mlp1 = MLPWithForwardUtil(0)
+        self.mlp2 = MLP(1)
+        self.mlp3 = MLP(2)
+        self.mlp4 = MLP(3)
+        self.mlp5 = MLPWithForwardUtil(3)
+        self.mlp6 = MLP(3)
+        self.mlp7 = MLP(3)
+        self.mlp8 = MLP(3)
+        self.sequence = nn.Sequential(
+            self.mlp1,
+            self.mlp2,
+            self.mlp3,
+            self.mlp4,
+            self.mlp5,
+            self.mlp6,
+            self.mlp7,
+            self.mlp8,
+        )
+
+    def forward(self, x):
+        return self.sequence(x)
+
+
+class PipelineSingleStageOpsTest(DTensorTestBase):
+    @property
+    def world_size(self):
+        return 4
+
+    @staticmethod
+    def loss_fn(x):
+        return x.mean()
+
+    def test_stage_forward(self):
+        """
+        Test single stage forward.
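+        Rank 0 first runs a golden EightMLP end to end; then pipeline ranks run
+        their stage chunks directly via engine.module() and check that the two
+        chunks of a stage produce different outputs.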
+ """ + if self.rank == 0: + self._run_no_pp_model() + n_gpus = torch.cuda.device_count() + assert n_gpus >= 2, "Requires at least 2 GPUs to run model with pp engine" + self._run_stage_forward() + + def _run_no_pp_model(self): + os.environ["model_name"] = "golden" + model = EightMLP().to("cuda:0") + optimizer = torch.optim.SGD( + model.parameters(), lr=0.01, momentum=0, dampening=0, weight_decay=0, nesterov=False + ) + torch.manual_seed(9999) + batch = [torch.ones(microbatch_size, 128, 32, dtype=torch.float32).to("cuda:0") for _ in range(factor)] + for mb in batch: + out = model(mb) + + @with_comms + def _run_stage_forward(self): + os.environ["model_name"] = "pp" + device = f"cuda:{self.rank}" + torch.cuda.set_device(device) + model = EightMLP().cuda() + + num_layers = 8 + config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.MANUAL, + num_stages=4, + virtual_chunks=2, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layers)], + split_points=["mlp2", "mlp4", "mlp6", "mlp8"], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.INTERLEAVED_1F1B, + ) + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=("PP", "DP", "TP"), + ) + + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + _parameters = list(stage_modules[0].parameters()) + list(stage_modules[1].parameters()) + optimizer = SGD(_parameters, **optimizer_fn_kwargs) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, config) + + engine = PipeEngine( + pipe_module, + VESCALE_DEVICE_MESH, + self.loss_fn, + config, + ) + torch.manual_seed(9999) + batch = [torch.ones(microbatch_size, 128, 32, dtype=torch.float32).to(device) for _ in range(factor)] + if self.rank == 0: + # first stage only receives inputs from dataloader + chunk_id = 0 + print(f"Chunk ID: {chunk_id}") + output_chunk_one = engine.module(None, local_inputs=batch[0], chunk_id=chunk_id) + chunk_id = 1 + print(f"Chunk ID: {chunk_id}") + output_chunk_two = engine.module(batch[1], local_inputs=None, chunk_id=chunk_id) + assert not torch.equal(output_chunk_one, output_chunk_two) + if self.rank == 2: + # other stages can receive inputs communicated by their peers + chunk_id = 0 + print(f"Chunk ID: {chunk_id}") + output_chunk_three = engine.module(batch[2], local_inputs=None, chunk_id=chunk_id) + chunk_id = 1 + print(f"Chunk ID: {chunk_id}") + output_chunk_four = engine.module(batch[3], local_inputs=None, chunk_id=chunk_id) + assert not torch.equal(output_chunk_three, output_chunk_four) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/api/test_schedule_engine.py b/test/parallel/pipeline/api/test_schedule_engine.py new file mode 100644 index 0000000..c508511 --- /dev/null +++ b/test/parallel/pipeline/api/test_schedule_engine.py @@ -0,0 +1,121 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import os +import torch +from common_dtensor import DTensorTestBase, with_comms +from torch.testing._internal.common_utils import run_tests +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +from vescale.pipe.pipe_stage import PipeModule, construct_stage_modules +from vescale.pipe._schedules.instruction_base import StageDeps +from vescale.pipe.pipe_emmiter import ScheduleEngine +from vescale.plan.spec import PipelineScheduleType, ModeType, PipelineSplitMethodType +from vescale.plan.pipeline_parallel import PipelineParallelPlan +from four_mlp import FourMLP +from torch.optim import SGD + + +class ScheduleEngineRuntimeTest(DTensorTestBase): + @property + def world_size(self): + return 2 + + @staticmethod + def loss_fn(x): + return x.mean() + + def _setup(self): + os.environ["model_name"] = "pp" + global local_rank + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + + torch.manual_seed(9999) + microbatch_size = 2 + factor = 4 + batch = [torch.ones(microbatch_size, 128, 1024, dtype=torch.float32).to(device) for _ in range(factor)] + return batch, microbatch_size + + @with_comms + def test_simple_1f1b(self): + """ + Test simple 1f1b schedule with schedule runtime. 
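+        Builds a two-stage FourMLP pipeline and drives ScheduleEngine.execute()
+        directly; this is a smoke test with no numeric assertion.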
+ """ + batch, microbatch_size = self._setup() + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(2, 1, 1), + mesh_dim_names=("PP", "DP", "TP"), + ) + + model = FourMLP(1024).cuda() + num_layers = 4 + + config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=1, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layers)], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + ) + + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + _parameters = list(stage_modules[0].parameters()) + optimizer = SGD(_parameters, **optimizer_fn_kwargs) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, config) + + dep = pipe_module.stage_deps + device_mesh_list = VESCALE_DEVICE_MESH.get_global_tensor_parallel_meshes() + stage_deps = StageDeps(dep, device_mesh_list, pipe_module) + + pipe_engine = ScheduleEngine( + stage_deps, + meshes=VESCALE_DEVICE_MESH.get_global_tensor_parallel_meshes(), + schedule=PipelineScheduleType.SIMPLE_1F1B, + batches=len(batch), + data_iterator=iter(batch), + stage_id=VESCALE_DEVICE_MESH.get_pipeline_parallel_rank(), + shape=(microbatch_size, 128, 1024), + dtype=torch.float32, + ) + minibatch_loss, all_forward_outputs = ScheduleEngine.execute(pipe_engine) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/api/test_simple_api.py b/test/parallel/pipeline/api/test_simple_api.py new file mode 100644 index 0000000..97b3d3b --- /dev/null +++ b/test/parallel/pipeline/api/test_simple_api.py @@ -0,0 +1,195 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +import os +from common_dtensor import DTensorTestBase, with_comms +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.debug.pdb import ForkedPdb +from vescale.optim.base_optimizer import BasicOptimizer +from vescale.pipe.pipe_stage import construct_pipeline_stage +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +from vescale.engine import PipeEngine +from vescale.plan import ( + PipelineParallelPlan, + PipelineScheduleType, + ModeType, + PipelineSplitMethodType, +) + + +class MLP(nn.Module): + def __init__(self, n_features): + super().__init__() + self.fc1 = nn.Linear(n_features, n_features * 2, bias=False) + torch.nn.init.uniform_(self.fc1.weight, 0, 1) + self.fc2 = nn.Linear(n_features * 2, n_features) + torch.nn.init.uniform_(self.fc2.weight, 0, 1) + self.gelu = nn.GELU() + + def forward(self, x): + t = self.fc1(x) + t = self.gelu(t) + t = self.fc2(t) + return t + + +class FourMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden) + self.mlp2 = MLP(hidden) + self.mlp3 = MLP(hidden) + self.mlp4 = MLP(hidden) + self.sequence = nn.Sequential(self.mlp1, self.mlp2, self.mlp3, self.mlp4) + + def forward(self, x): + return self.sequence(x) + + +class EightMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden) + self.mlp2 = MLP(hidden) + self.mlp3 = MLP(hidden) + self.mlp4 = MLP(hidden) + self.mlp5 = MLP(hidden) + self.mlp6 = MLP(hidden) + self.mlp7 = MLP(hidden) + self.mlp8 = MLP(hidden) + + def forward(self, x): + x = self.mlp1(x) + x.retain_grad() + x = self.mlp2(x) + x.retain_grad() + x = self.mlp3(x) + x.retain_grad() + x = self.mlp4(x) + x.retain_grad() + x = self.mlp5(x) + x.retain_grad() + x = self.mlp6(x) + x.retain_grad() + x = self.mlp7(x) + x.retain_grad() + x = self.mlp8(x) + return x + + +class SimpleAPITest(DTensorTestBase): + @property + def world_size(self): + return 4 + + @staticmethod + def loss_fn(x): + return torch.sum(x) + + def _prepare_runtime_engine(self, model, forward_only: bool = False): + pipe_plan = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.MANUAL, + num_stages=4, + virtual_chunks=1, + smallest_unsplittable_units=["mlp1", "mlp2", "mlp3", "mlp4"], + split_points=["mlp1", "mlp2", "mlp3", "mlp4"], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + forward_only=forward_only, + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=["PP", "DP", "TP"], + ) + pipe_module = construct_pipeline_stage( + model, + pipe_plan, + VESCALE_DEVICE_MESH, + lr_scheduler=None, + update_split_points=True, + ) + optimizer = torch.optim.SGD(pipe_module.parameters(), **optimizer_fn_kwargs) + basic_optimizer = BasicOptimizer(optimizer, models=pipe_module) + engine = PipeEngine( + pipe_module, + VESCALE_DEVICE_MESH, + self.loss_fn, + pipe_plan, + ) + + return engine, optimizer + + @with_comms + def test_simple_api(self): + """ + Tests pipeline engine with simple API. 
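+        The stage is built with the one-call construct_pipeline_stage() API, and
+        the minibatch loss is compared against a single-GPU golden run on the
+        last rank.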
+ """ + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + n_hidden = 3 + batches = 8 + model = FourMLP(n_hidden).cuda() + + all_batches_out = [] + if local_rank == 3: + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(3) + model.cuda(3) + out = model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + print(loss) + print(" ====================================== ") + + engine, optimizer = self._prepare_runtime_engine(model) + + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append(data.to(device)) + + minibatch_loss, _ = engine(data_iterator) + + if local_rank == 3: + self.assertEqual(minibatch_loss, sum(all_batches_out)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/backend/eight_mlp.py b/test/parallel/pipeline/backend/eight_mlp.py new file mode 100644 index 0000000..b4d2e0f --- /dev/null +++ b/test/parallel/pipeline/backend/eight_mlp.py @@ -0,0 +1,288 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+import torch.nn as nn
+from vescale.dtensor.placement_types import Shard, Replicate
+
+
+class Embed(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.embedding = nn.Embedding(8, 64)
+
+    def forward(self, x):
+        return self.embedding(x)
+
+    def get_word_embeddings_weight(self):
+        return self.embedding.weight
+
+
+class EmbedTwo(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.embedding = nn.Embedding(8, 64)
+
+    def forward(self, x):
+        return self.embedding(x)
+
+    def get_word_embeddings_weight(self):
+        return self.embedding.weight
+
+
+class MLP(nn.Module):
+    def __init__(self, features_in, features_out, value):
+        super().__init__()
+        self.value = value
+        self.fc1 = nn.Linear(features_in, 16, bias=False)
+        self.fc1.weight.data.fill_(value)
+        self.fc2 = nn.Linear(16, features_out, bias=False)
+        self.fc2.weight.data.fill_(value * 2)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        t = self.fc1(x)
+        t = self.gelu(t)
+        t = self.fc2(t)
+        return t
+
+
+class SmallMLP(nn.Module):
+    def __init__(self, features_in, features_out, value):
+        super().__init__()
+        self.value = value
+        self.fc1 = nn.Linear(features_in, features_out, bias=False)
+        self.fc1.weight.data.fill_(value)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        t = self.fc1(x)
+        t = self.gelu(t)
+        return t
+
+
+class HierarchicalMLP(nn.Module):
+    def __init__(self, features_in, features_out, value):
+        super().__init__()
+        self.value = value
+        self.fc0 = SmallMLP(features_in, features_in, value)
+        self.fc1 = nn.Linear(features_in, 16, bias=False)
+        self.fc2 = nn.Linear(16, features_out, bias=False)
+        self.fc3 = SmallMLP(features_out, features_out, value)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        x = x + x
+        x = self.fc0(x)
+        t = self.fc1(x)
+        t = self.gelu(t)
+        t = self.fc2(t)
+        t = self.fc3(t)
+        return t
+
+
+# keep the original (misspelled) name as an alias for backward compatibility
+HierachicalMLP = HierarchicalMLP
+
+
+class EightMLP(nn.Module):
+    def __init__(self, hidden=64, fixed_size=True, embedded_module=False):
+        super().__init__()
+        module = HierarchicalMLP if embedded_module else MLP
+        if fixed_size:
+            self.mlp1 = module(hidden, hidden, 0)
+            self.mlp2 = module(hidden, hidden, 1)
+            self.mlp3 = module(hidden, hidden, 2)
+            self.mlp4 = module(hidden, hidden, 3)
+            self.mlp5 = module(hidden, hidden, 4)
+            self.mlp6 = module(hidden, hidden, 5)
+            self.mlp7 = module(hidden, hidden, 6)
+            self.mlp8 = module(hidden, hidden, 7)
+        else:
+            self.mlp1 = module(hidden * 1, hidden * 2, 0)
+            self.mlp2 = module(hidden * 2, hidden * 3, 1)
+            self.mlp3 = module(hidden * 3, hidden * 4, 2)
+            self.mlp4 = module(hidden * 4, hidden * 5, 3)
+            self.mlp5 = module(hidden * 5, hidden * 6, 4)
+            self.mlp6 = module(hidden * 6, hidden * 7, 5)
+            self.mlp7 = module(hidden * 7, hidden * 8, 6)
+            self.mlp8 = module(hidden * 8, hidden * 9, 7)
+
+    def forward(self, x):
+        x = self.mlp1(x)
+        x = self.mlp2(x)
+        x = self.mlp3(x)
+        x = self.mlp4(x)
+        x = self.mlp5(x)
+        x = self.mlp6(x)
+        x = self.mlp7(x)
+        x = self.mlp8(x)
+        return x
+
+
+class EightMLPDiffNames(nn.Module):
+    def __init__(self, hidden=64):
+        super().__init__()
+        self.mlp1 = MLP(hidden, hidden, 0)
+        self.mlp2 = MLP(hidden, hidden, 1)
+        self.mlp3 = MLP(hidden, hidden, 2)
+        self.layer1 = MLP(hidden, hidden, 3)
+        self.layer2 = MLP(hidden, hidden, 4)
+        self.layer3 = MLP(hidden, hidden, 5)
+        self.layer4 = MLP(hidden, hidden, 6)
+        self.more_layer1 = MLP(hidden, hidden, 7)
+
+    def forward(self, x):
+        x = self.mlp1(x)
+ x = self.mlp2(x) + x = self.mlp3(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.more_layer1(x) + return x + + +class EightMLPWithOps(nn.Module): + def __init__(self, hidden=64): + super().__init__() + self.mlp1 = MLP(hidden, hidden, 0) + self.mlp2 = MLP(hidden, hidden, 1) + self.mlp3 = MLP(hidden, hidden, 2) + self.mlp4 = MLP(hidden, hidden, 3) + self.mlp5 = MLP(hidden, hidden, 4) + self.mlp6 = MLP(hidden, hidden, 5) + self.mlp7 = MLP(hidden, hidden, 6) + self.mlp8 = MLP(hidden, hidden, 7) + + def forward(self, x): + x = x + x + x = self.mlp1(x) + x = x * 2 + x = self.mlp2(x) + x = x * 2 + x = x * 2 + x = x * 2 + x = self.mlp3(x) + x = self.mlp4(x) + x = self.mlp5(x) + x = self.mlp6(x) + x = self.mlp7(x) + x = self.mlp8(x) + return x + + +class EightMLPWithOpsTail(nn.Module): + def __init__(self, hidden=64): + super().__init__() + self.mlp1 = MLP(hidden, hidden, 0) + self.mlp2 = MLP(hidden, hidden, 1) + self.mlp3 = MLP(hidden, hidden, 2) + self.mlp4 = MLP(hidden, hidden, 3) + self.mlp5 = MLP(hidden, hidden, 4) + self.mlp6 = MLP(hidden, hidden, 5) + self.mlp7 = MLP(hidden, hidden, 6) + self.mlp8 = MLP(hidden, hidden, 7) + + def forward(self, x): + x = x + x + x = self.mlp1(x) + x = x * 2 + x = self.mlp2(x) + x = x * 2 + x = self.mlp3(x) + x = self.mlp4(x) + x = self.mlp5(x) + x = self.mlp6(x) + x = self.mlp7(x) + x = self.mlp8(x) + x = x * 2 + x = x * 4 + x = x + 4 + return x + + +class EightMLPSharedEmbed(nn.Module): + def __init__(self, hidden=64): + super().__init__() + self.embed1 = Embed() + self.mlp1 = MLP(hidden, hidden, 0) + self.mlp2 = MLP(hidden, hidden, 1) + self.mlp3 = MLP(hidden, hidden, 2) + self.mlp4 = MLP(hidden, hidden, 3) + self.mlp5 = MLP(hidden, hidden, 4) + self.mlp6 = MLP(hidden, hidden, 5) + self.mlp7 = MLP(hidden, hidden, 6) + self.mlp8 = MLP(hidden, hidden, 7) + self.embed2 = EmbedTwo() + + def forward(self, x): + x = self.embed1(x).float() + x = self.mlp1(x) + x = self.mlp2(x) + x = self.mlp3(x) + x = self.mlp4(x) + x = self.mlp5(x) + x = self.mlp6(x) + x = self.mlp7(x) + x = self.mlp8(x).long() + x = self.embed2(x) + return x + + +sharding_plan = { + "forward": { + r"mlp\d.input": [[Replicate()]], + r"mlp\d.output": [[Replicate()]], + }, + "parameter": { + r"mlp\d.fc1.weight": [Shard(0)], + r"mlp\d.fc2.weight": [Shard(1)], + }, +} + +sharding_plan_two = { + "forward": { + r"mlp\d.input": [[Replicate()]], + r"mlp\d.output": [[Replicate()]], + }, + "parameter": { + r"mlp\d.weight": [Shard(1)], + }, +} + +sharding_plan_combo = { + "forward": { + r"mlp\d.input": [[Replicate()]], + r"mlp\d.output": [[Replicate()]], + r"layer\d.input": [[Replicate()]], + r"layer\d.output": [[Replicate()]], + r"more_layer\d.input": [[Replicate()]], + r"more_layer\d.output": [[Replicate()]], + }, + "parameter": { + r"mlp\d.weight": [Shard(1)], + r"layer\d.weight": [[Replicate()]], + }, +} + +sharding_plan_fc = { + "forward": { + r"mlp\d.fc\d.input": [[Replicate()]], + r"mlp\d.fc\d.output": [[Replicate()]], + }, + "parameter": { + r"mlp\d.fc1.weight": [Shard(0)], + r"mlp\d.fc2.weight": [Shard(1)], + }, +} diff --git a/test/parallel/pipeline/backend/test_p2p_comm.py b/test/parallel/pipeline/backend/test_p2p_comm.py new file mode 100644 index 0000000..55d47bb --- /dev/null +++ b/test/parallel/pipeline/backend/test_p2p_comm.py @@ -0,0 +1,994 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import os
+import torch
+import torch.distributed as dist
+from torch.testing._internal.common_utils import run_tests
+from vescale import DeviceMesh, distribute_tensor
+from vescale.dtensor.placement_types import Replicate
+from vescale.pipe.p2p_communication import (
+    _communicate,
+    _communicate_shapes,
+    _mapping_local_rank_to_target_rank_by_device_mesh,
+    recv_forward,
+    recv_backward,
+    send_forward,
+    send_backward,
+    send_forward_recv_backward,
+    send_backward_recv_forward,
+    send_forward_recv_forward,
+    send_backward_recv_backward,
+    send_forward_backward_recv_forward_backward,
+    drain_recv_reqs,
+)
+from common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+
+
+class PipeP2PTest(DTensorTestBase):
+    @staticmethod
+    def set_up_device_mesh_stages(world_size, device, n):
+        assert world_size % n == 0, "world size must be divisible by the number of stages"
+        n_device = world_size // n
+        return (DeviceMesh(device, list(range(n_device * i, n_device * (i + 1)))) for i in range(n))
+
+    @staticmethod
+    def apply_xavier_normal_with_seed(tensor, seed=99999):
+        torch.manual_seed(seed)
+        torch.nn.init.xavier_normal_(tensor)
+
+    @property
+    def world_size(self):
+        return 8
+
+    @property
+    def sequence_len(self):
+        return 8
+
+    @property
+    def batch_size(self):
+        return 4
+
+    @property
+    def input_size(self):
+        return 2
+
+    @property
+    def stages(self):
+        return 4
+
+    def _generate_device_meshes(self):
+        device = f"cuda:{self.rank}"
+        # stage1
+        device_mesh_stage1 = DeviceMesh(device, list(range(self.world_size // 2)))
+        # stage2
+        device_mesh_stage2 = DeviceMesh(device, list(range(self.world_size // 2, self.world_size)))
+        return device_mesh_stage1, device_mesh_stage2
+
+    def _generate_three_device_meshes(self):
+        device = f"cuda:{self.rank}"
+        # stage1
+        device_mesh_stage1 = DeviceMesh(device, list(range(self.world_size // 4)))
+        # stage2
+        device_mesh_stage2 = DeviceMesh(device, list(range(self.world_size // 4, self.world_size // 2)))
+        # stage3
+        device_mesh_stage3 = DeviceMesh(device, list(range(self.world_size // 2, self.world_size // 4 * 3)))
+        return device_mesh_stage1, device_mesh_stage2, device_mesh_stage3
+
+    @with_comms
+    def test_communicate_shapes(self):
+        """
+        Test the correctness of _communicate_shapes().
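+        Stage 1 sends its tensor shape to stage 2, which must receive
+        [sequence_len, batch_size, input_size].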
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2 = self._generate_device_meshes() + + # stage 1 tensor + tensor_stage1 = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + torch.nn.init.xavier_normal_(tensor_stage1) + dist.all_reduce(tensor_stage1, async_op=False) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + if self.rank in device_mesh_stage1.mesh.tolist(): + target_rank = _mapping_local_rank_to_target_rank_by_device_mesh( + local_rank=self.rank, current_device_mesh=device_mesh_stage1, target_device_mesh=device_mesh_stage2 + ) + _communicate_shapes( + local_rank=self.rank, + tensor_send_next=dtensor_stage1, + tensor_send_prev=None, + next_rank=target_rank, + prev_rank=None, + recv_prev=False, + recv_next=False, + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + target_rank = _mapping_local_rank_to_target_rank_by_device_mesh( + local_rank=self.rank, current_device_mesh=device_mesh_stage2, target_device_mesh=device_mesh_stage1 + ) + recv_prev_shape, _ = _communicate_shapes( + local_rank=self.rank, + tensor_send_next=None, + tensor_send_prev=None, + prev_rank=target_rank, + next_rank=None, + recv_prev=True, + recv_next=False, + ) + self.assertTrue(recv_prev_shape == [self.sequence_len, self.batch_size, self.input_size]) + + @with_comms + def test_communicate_no_batch_p2p_comm(self): + """ + Test correctness of p2p communication ops. + """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2 = self._generate_device_meshes() + # stage 1 tensor + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + if self.rank in device_mesh_stage1.mesh.tolist(): + _communicate( + tensor_send_next=dtensor_stage1._local_tensor, + tensor_send_prev=None, + current_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage2, + recv_prev=False, + recv_next=False, + tensor_shape=None, + batch_p2p_comm=False, + wait_on_reqs=True, + dtype=None, + ) + + if self.rank in device_mesh_stage2.mesh.tolist(): + recv_prev_tensor, _, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=None, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + recv_prev=True, + recv_next=False, + tensor_shape=None, + batch_p2p_comm=False, + wait_on_reqs=True, + dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + recv_prev_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + + @with_comms + def test_communicate_batch_p2p_comm(self): + """ + Test correctness of batch communication ops. 
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2 = self._generate_device_meshes() + # stage 1 tensor + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + if self.rank in device_mesh_stage1.mesh.tolist(): + _communicate( + tensor_send_next=dtensor_stage1._local_tensor, + tensor_send_prev=None, + current_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage2, + recv_prev=False, + recv_next=False, + tensor_shape=None, + batch_p2p_comm=True, + wait_on_reqs=True, + dtype=None, + ) + + if self.rank in device_mesh_stage2.mesh.tolist(): + recv_prev_tensor, _, _ = _communicate( + tensor_send_next=None, + tensor_send_prev=None, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + recv_prev=True, + recv_next=False, + tensor_shape=None, + batch_p2p_comm=True, + wait_on_reqs=True, + dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + recv_prev_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + + @with_comms + def test_send_forward_and_recv_forward(self): + """ + Test correctness of send_forward() and recv_forward(). + """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + stage_list = list(self.set_up_device_mesh_stages(self.world_size, device, self.stages)) + seed_list = list(range(99990, 99990 + self.stages)) + stage_n_dict = {(self.rank in stage.mesh.tolist()): i for i, stage in enumerate(stage_list)} + stage_n = stage_n_dict[True] + send_seed = seed_list[stage_n] + recv_seed = seed_list[stage_n - 1] + prev_stage = stage_list[stage_n - 1] + curr_stage = stage_list[stage_n] + next_stage = stage_list[(stage_n + 1) % len(stage_list)] + send_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + expt_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + self.apply_xavier_normal_with_seed(send_t, seed=send_seed) + self.apply_xavier_normal_with_seed(expt_t, seed=recv_seed) + + if stage_n % 2 == 0: + send_forward( + output_tensor=send_t, + current_device_mesh=curr_stage, + peer_device_mesh=next_stage, + tensor_shape=send_t.shape, + ) + else: + recv_prev_tensor = recv_forward( + tensor_shape=expt_t.shape, + recv_dtype=expt_t.dtype, + current_device_mesh=curr_stage, + peer_device_mesh=prev_stage, + ) + self.assertTrue(torch.equal(recv_prev_tensor, expt_t)) + + @with_comms + def test_send_backward_and_recv_backward(self): + """ + Test correctness of send_backward() and recv_backward(). 
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + stage_list = list(self.set_up_device_mesh_stages(self.world_size, device, self.stages)) + seed_list = list(range(99990, 99990 + self.stages)) + stage_n_dict = {(self.rank in stage.mesh.tolist()): i for i, stage in enumerate(stage_list)} + stage_n = stage_n_dict[True] + send_seed = seed_list[stage_n] + recv_seed = seed_list[(stage_n + 1) % len(seed_list)] + prev_stage = stage_list[stage_n - 1] + curr_stage = stage_list[stage_n] + next_stage = stage_list[(stage_n + 1) % len(stage_list)] + send_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + expt_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + self.apply_xavier_normal_with_seed(send_t, seed=send_seed) + self.apply_xavier_normal_with_seed(expt_t, seed=recv_seed) + + if stage_n % 2 == 0: + send_backward( + input_tensor_grad=send_t, + current_device_mesh=curr_stage, + peer_device_mesh=prev_stage, + tensor_shape=send_t.shape, + ) + else: + recv_prev_tensor = recv_backward( + tensor_shape=expt_t.shape, + recv_dtype=expt_t.dtype, + current_device_mesh=curr_stage, + peer_device_mesh=next_stage, + ) + self.assertTrue(torch.equal(recv_prev_tensor, expt_t)) + + @with_comms + def test_send_forward_recv_backward_and_send_backward_recv_forward(self): + """ + Test correctness of send_backward() and recv_backward(). + """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + stage_list = list(self.set_up_device_mesh_stages(self.world_size, device, self.stages)) + fwd_seed_list = list(range(99990, 99990 + self.stages)) + bwd_seed_list = list(range(77770, 77770 + self.stages)) + stage_n_dict = {(self.rank in stage.mesh.tolist()): i for i, stage in enumerate(stage_list)} + stage_n = stage_n_dict[True] + fwd_send_seed = fwd_seed_list[stage_n] + fwd_recv_seed = fwd_seed_list[stage_n - 1] + bwd_send_seed = bwd_seed_list[stage_n] + bwd_recv_seed = bwd_seed_list[(stage_n + 1) % len(bwd_seed_list)] + prev_stage = stage_list[stage_n - 1] + curr_stage = stage_list[stage_n] + next_stage = stage_list[(stage_n + 1) % len(stage_list)] + fwd_send_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + fwd_expt_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + bwd_send_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + bwd_expt_t = torch.empty(self.sequence_len, self.batch_size, self.input_size, device=device) + self.apply_xavier_normal_with_seed(fwd_send_t, seed=fwd_send_seed) + self.apply_xavier_normal_with_seed(fwd_expt_t, seed=fwd_recv_seed) + self.apply_xavier_normal_with_seed(bwd_send_t, seed=bwd_send_seed) + self.apply_xavier_normal_with_seed(bwd_expt_t, seed=bwd_recv_seed) + if stage_n % 2 == 0: + recv_bwd_tensor = send_forward_recv_backward( + output_tensor=fwd_send_t, + tensor_shape=bwd_expt_t.shape, + recv_dtype=bwd_expt_t.dtype, + current_device_mesh=curr_stage, + peer_device_mesh=next_stage, + ) + self.assertTrue(torch.equal(recv_bwd_tensor, bwd_expt_t)) + else: + recv_fwd_tensor = send_backward_recv_forward( + input_tensor_grad=bwd_send_t, + tensor_shape=fwd_expt_t.shape, + recv_dtype=fwd_expt_t.dtype, + current_device_mesh=curr_stage, + peer_device_mesh=prev_stage, + ) + self.assertTrue(torch.equal(recv_fwd_tensor, fwd_expt_t)) + 
+ @with_comms + def test_send_forward_recv_forward_no_shape(self): + """ + Test correctness of send_forward_recv_forward without sharing tensor shape in advance. + """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + send_forward( + output_tensor=dtensor_stage1.to_local(), + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + tensor_shape=None, + ) + + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor = send_forward_recv_forward( + output_tensor=dtensor_stage2._local_tensor, + recv_prev=True, + tensor_shape=None, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + stage2_recv_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + stage3_recv_tensor = recv_forward( + tensor_shape=None, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + stage3_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1, + ) + ) + + @with_comms + def test_send_forward_recv_forward_with_shape(self): + """ + Test correctness of send_forward_recv_forward with known tensor shape. 
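+        Three stages relay activations; the middle stage sends downstream and
+        receives from upstream in a single call.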
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + send_forward( + output_tensor=dtensor_stage1.to_local(), + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + tensor_shape=shape, + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor = send_forward_recv_forward( + output_tensor=dtensor_stage2._local_tensor, + recv_prev=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + stage2_recv_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + stage3_recv_tensor = recv_forward( + tensor_shape=shape, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + stage3_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1, + ) + ) + + @with_comms + def test_send_backward_recv_backward_no_shape(self): + """ + Test correctness of send_backward_recv_backward(). 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = None + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + stage1_recv_tensor = recv_backward( + tensor_shape=shape, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + stage1_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1, + ) + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor = send_backward_recv_backward( + input_tensor_grad=dtensor_stage2._local_tensor, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + stage2_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + send_backward( + input_tensor_grad=dtensor_stage3.to_local(), + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + tensor_shape=shape, + ) + + @with_comms + def test_send_backward_recv_backward_with_shape(self): + """ + Test correctness of send_backward_recv_backward(). 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + stage1_recv_tensor = recv_backward( + tensor_shape=shape, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + stage1_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1, + ) + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor = send_backward_recv_backward( + input_tensor_grad=dtensor_stage2._local_tensor, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + stage2_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + send_backward( + input_tensor_grad=dtensor_stage3.to_local(), + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + tensor_shape=shape, + ) + + @with_comms + def test_send_forward_backward_recv_forward_backward_with_shape(self): + """ + Test correctness of send_forward_backward_recv_forward_backward(). 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + recv_bwd_tensor = send_forward_recv_backward( + output_tensor=dtensor_stage1._local_tensor, + tensor_shape=shape, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + recv_bwd_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + ) + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + input_tensor, output_tensor_grad = send_forward_backward_recv_forward_backward( + output_tensor=dtensor_stage2._local_tensor, + input_tensor_grad=dtensor_stage2._local_tensor, + recv_prev=True, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + input_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + self.assertTrue( + torch.equal( + output_tensor_grad, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + recv_fwd_tensor = send_backward_recv_forward( + input_tensor_grad=dtensor_stage3.to_local(), + tensor_shape=shape, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + recv_fwd_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + ) + ) + + @with_comms + def test_send_forward_backward_recv_forward_backward_no_shape(self): + """ + Test correctness of send_forward_backward_recv_forward_backward() + without sharing tensor shapes in advance. 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + recv_bwd_tensor = send_forward_recv_backward( + output_tensor=dtensor_stage1._local_tensor, + tensor_shape=None, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + recv_bwd_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + ) + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + input_tensor, output_tensor_grad = send_forward_backward_recv_forward_backward( + output_tensor=dtensor_stage2._local_tensor, + input_tensor_grad=dtensor_stage2._local_tensor, + recv_prev=True, + recv_next=True, + tensor_shape=None, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + input_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + self.assertTrue( + torch.equal( + output_tensor_grad, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + recv_fwd_tensor = send_backward_recv_forward( + input_tensor_grad=dtensor_stage3.to_local(), + tensor_shape=None, + recv_dtype=torch.float32, + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + ) + self.assertTrue( + torch.equal( + recv_fwd_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + ) + ) + + @with_comms + def test_send_forward_recv_forward_with_shape_next_device_mesh_none(self): + """ + Test correctness of send_forward_recv_forward() with tensor shapes known. 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, _ = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + send_forward( + output_tensor=dtensor_stage1.to_local(), + current_device_mesh=device_mesh_stage1, + peer_device_mesh=device_mesh_stage2, + tensor_shape=shape, + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor = send_forward_recv_forward( + output_tensor=dtensor_stage2._local_tensor, + recv_prev=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=None, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + stage2_recv_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + + @with_comms + def test_send_backward_recv_backward_with_shape_device_mesh_none(self): + """ + Test correctness of send_backward_recv_backward() with tensor shapes known. + """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor = send_backward_recv_backward( + input_tensor_grad=dtensor_stage2._local_tensor, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=None, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + ) + self.assertTrue( + torch.equal( + stage2_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + if self.rank in device_mesh_stage3.mesh.tolist(): + send_backward( + input_tensor_grad=dtensor_stage3.to_local(), + current_device_mesh=device_mesh_stage3, + peer_device_mesh=device_mesh_stage2, + tensor_shape=shape, + ) + + @with_comms + def test_send_backward_recv_backward_with_shape_p2p_overlap(self): + """ + Test correctness of send_backward_recv_backward() with overlapped p2p on. 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor, bwd_wait_handles = send_backward_recv_backward( + input_tensor_grad=dtensor_stage2._local_tensor, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=None, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + + if self.rank in device_mesh_stage3.mesh.tolist(): + stage3_recv_tensor, bwd_wait_handles = send_backward_recv_backward( + input_tensor_grad=dtensor_stage3._local_tensor, + recv_next=False, + tensor_shape=shape, + current_device_mesh=device_mesh_stage3, + prev_device_mesh=device_mesh_stage2, + next_device_mesh=None, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + drain_recv_reqs("backward") + if self.rank in device_mesh_stage2.mesh.tolist(): + self.assertTrue( + torch.equal( + stage2_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + + @with_comms + def test_send_forward_recv_forward_with_shape_p2p_overlap(self): + """ + Test correctness of send_forward_recv_forward() with overlapped p2p on. 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + stage1_recv_tensor, fwd_wait_handles = send_forward_recv_forward( + output_tensor=dtensor_stage1._local_tensor, + recv_prev=False, + tensor_shape=shape, + current_device_mesh=device_mesh_stage1, + prev_device_mesh=None, + next_device_mesh=device_mesh_stage2, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor, fwd_wait_handles = send_forward_recv_forward( + output_tensor=dtensor_stage2._local_tensor, + recv_prev=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=None, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + drain_recv_reqs("forward") + if self.rank in device_mesh_stage2.mesh.tolist(): + self.assertTrue( + torch.equal( + stage2_recv_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + + @with_comms + def test_send_backward_recv_backward_with_shape_p2p_overlap_auto_modify(self): + """ + Test correctness of send_backward_recv_backward() with overlapped p2p on. 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + tensor_stage3 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2 + dtensor_stage3 = distribute_tensor(tensor_stage3, device_mesh_stage3, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor, bwd_wait_handles = send_backward_recv_backward( + input_tensor_grad=dtensor_stage2._local_tensor, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=None, + next_device_mesh=device_mesh_stage3, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + + if self.rank in device_mesh_stage3.mesh.tolist(): + stage3_recv_tensor, bwd_wait_handles = send_backward_recv_backward( + input_tensor_grad=dtensor_stage3._local_tensor, + recv_next=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage3, + prev_device_mesh=device_mesh_stage2, + next_device_mesh=None, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + + drain_recv_reqs("backward") + if self.rank in device_mesh_stage2.mesh.tolist(): + self.assertTrue( + torch.equal( + stage2_recv_tensor, + torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 2, + ) + ) + + @with_comms + def test_send_forward_recv_forward_with_shape_p2p_overlap_auto_modify(self): + """ + Test correctness of send_forward_recv_forward() with overlapped p2p on. 
+ """ + os.environ["LOCAL_RANK"] = str(self.rank) + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1, device_mesh_stage2, device_mesh_stage3 = self._generate_three_device_meshes() + # stage 1 tensor + shape = (self.sequence_len, self.batch_size, self.input_size) + tensor_stage1 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + dtensor_stage1 = distribute_tensor(tensor_stage1, device_mesh_stage1, placements=[Replicate()]) + tensor_stage2 = torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + 1 + dtensor_stage2 = distribute_tensor(tensor_stage2, device_mesh_stage2, placements=[Replicate()]) + # send to stage 2 + if self.rank in device_mesh_stage1.mesh.tolist(): + stage1_recv_tensor, fwd_wait_handles = send_forward_recv_forward( + output_tensor=dtensor_stage1._local_tensor, + recv_prev=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage1, + prev_device_mesh=None, + next_device_mesh=device_mesh_stage2, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + if self.rank in device_mesh_stage2.mesh.tolist(): + stage2_recv_tensor, fwd_wait_handles = send_forward_recv_forward( + output_tensor=dtensor_stage2._local_tensor, + recv_prev=True, + tensor_shape=shape, + current_device_mesh=device_mesh_stage2, + prev_device_mesh=device_mesh_stage1, + next_device_mesh=None, + recv_dtype=torch.float32, + overlap_p2p_comm=True, + batch_p2p_comm=False, + ) + + drain_recv_reqs("forward") + if self.rank in device_mesh_stage2.mesh.tolist(): + self.assertTrue( + torch.equal( + stage2_recv_tensor, torch.ones(self.sequence_len, self.batch_size, self.input_size, device=device) + ) + ) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/backend/test_pipe.py b/test/parallel/pipeline/backend/test_pipe.py new file mode 100644 index 0000000..65614ac --- /dev/null +++ b/test/parallel/pipeline/backend/test_pipe.py @@ -0,0 +1,342 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +import torch +import numpy as np +import torch.fx as fx +import re +from torch.testing._internal.common_utils import run_tests +from common_dtensor import DTensorTestBase, with_comms +from vescale.pipe import PipeModule, construct_stage_modules, construct_pipeline_split_graph +from vescale.plan import ( + PipelineParallelPlan, + PipelineScheduleType, + PipelineSplitMethodType, + ModeType, + TracerType, +) +from vescale.initialize.deferred_init import deferred_init, is_deferred +from eight_mlp import EightMLP, sharding_plan, sharding_plan_fc +from vescale.dmodule._dmodule import DModule +from vescale.dmodule.api import parallelize_module +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +import torch.distributed as dist +from vescale.ddp.distributed_data_parallel import DistributedDataParallel as DDP +from vescale.optim.distributed_optimizer import DistributedOptimizer +from vescale.dtensor.api import distribute_tensor +from vescale.dtensor.placement_types import Replicate +from torch.fx.passes.split_utils import split_by_tags + + +class PipeModuleTest(DTensorTestBase): + @property + def world_size(self): + return 4 + + @staticmethod + def loss_fn(x): + return x.mean() + + def _setup(self, pp_size: int = 2, dp_size: int = 1, tp_size: int = 2, virtual_chunks: int = 1): + num_layers = 8 + VESCALE_DEVICE_MESH.init_device_mesh("cuda", (pp_size, dp_size, tp_size), mesh_dim_names=("PP", "DP", "TP")) + deferred_mlp = deferred_init(EightMLP, hidden=8) + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=virtual_chunks, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layers)], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B + if virtual_chunks == 1 + else PipelineScheduleType.INTERLEAVED_1F1B, + ) + return deferred_mlp, pipe_config + + @with_comms + def test_generate_stage_dependency(self): + """ + Tests PipeModule's ability to generate inter-stage dependency. + """ + deferred_mlp, config = self._setup() + num_stages = 2 + + _, stage_dependency, p2p_index_mapping = construct_stage_modules( + deferred_mlp, config, VESCALE_DEVICE_MESH, update_split_points=True + ) + + target_deps = np.zeros((num_stages, num_stages)) + target_deps[0, 1] = 1 + target_p2p_mapping = {0: [(0, 0)], 1: [(0, 0)]} + self.assertEqual(stage_dependency, target_deps) + flattened_index_mapping = { + i: [(spec[0].peer_stage_idx, spec[0].peer_output_idx)] for i, spec in p2p_index_mapping.items() + } + self.assertEqual(flattened_index_mapping, target_p2p_mapping) + + @with_comms + def test_generate_stage_dependency_four_stages(self): + """ + Tests PipeModule's ability to generate inter-stage dependency among four pipeline stages. 
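+
+        For a linear pipeline the dependency is simply the adjacency matrix
+        of a chain; for the four stages constructed below:
+
+            target_deps = np.zeros((4, 4))
+            target_deps[i, i + 1] = 1  # stage i feeds stage i + 1, for i = 0, 1, 2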
+ """ + deferred_mlp, config = self._setup(pp_size=4, dp_size=1, tp_size=1, virtual_chunks=1) + num_stages = 4 + config.num_stages = num_stages + + _, stage_dependency, p2p_index_mapping = construct_stage_modules( + deferred_mlp, config, VESCALE_DEVICE_MESH, update_split_points=True + ) + + target_deps = np.zeros((num_stages, num_stages)) + target_deps[0, 1] = 1 + target_deps[1, 2] = 1 + target_deps[2, 3] = 1 + target_p2p_mapping = {0: [(0, 0)], 1: [(0, 0)], 2: [(1, 0)], 3: [(2, 0)]} + self.assertEqual(stage_dependency, target_deps) + flattened_index_mapping = { + i: [(spec[0].peer_stage_idx, spec[0].peer_output_idx)] for i, spec in p2p_index_mapping.items() + } + self.assertEqual(flattened_index_mapping, target_p2p_mapping) + + @with_comms + def test_forward(self): + """ + Tests PipeModule's forward function. + """ + deferred_mlp, _ = self._setup(virtual_chunks=2) + num_layers = 8 + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=4, + virtual_chunks=2, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layers)], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + ) + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=["PP", "DP", "TP"], + ) + tp_mesh = VESCALE_DEVICE_MESH["TP"] + + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + deferred_mlp, + pipe_config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + for i in range(len(stage_modules)): + parallelized_module = parallelize_module( + stage_modules[i], + tp_mesh, + sharding_plan, + factory=False, + ) + stage_modules[i] = parallelized_module + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + _parameters = list(stage_modules[0].parameters()) + list(stage_modules[1].parameters()) + optimizer = torch.optim.SGD(_parameters, **optimizer_fn_kwargs) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pipe_config) + + model_chunk_one = pipe_module[0] + model_chunk_two = pipe_module[1] + assert DModule.is_dmodule(pipe_module.stage_modules[0]) + assert DModule.is_dmodule(pipe_module.stage_modules[1]) + input = torch.randn((3, 8)) + out_chunk_one = pipe_module(input, chunk_id=0) + out_chunk_two = pipe_module(input, chunk_id=1) + assert torch.equal(out_chunk_one, model_chunk_one(input)) + assert torch.equal(out_chunk_two, model_chunk_two(input)) + + +class PipeModuleTraceTest(DTensorTestBase): + @with_comms + def test_compile_mode(self): + """ + Tests correctness of registering hooks on partitioned model graphs. 
+ """ + model = EightMLP(8) + + def hook(sel, args): + print(f"{torch.distributed.get_rank()}: call hook") + return args + + graph = fx.symbolic_trace(model) + input = torch.randn((3, 8)) + rule = r"mlp\d+.*" + for node in graph.graph.nodes: + if re.match(rule, node.name): + if int(node.name[3]) <= 4: + node.tag = "stage0" + else: + node.tag = "stage1" + global_graph = split_by_tags(graph, ["stage0", "stage1"]) + splited_module = global_graph.get_submodule("stage0") + splited_module.mlp1.fc1.register_forward_pre_hook(hook) + splited_module.mlp1.gelu.register_forward_pre_hook(hook) + splited_module.mlp1.fc2.register_forward_pre_hook(hook) + splited_module.mlp2.fc1.register_forward_pre_hook(hook) + splited_module.mlp2.gelu.register_forward_pre_hook(hook) + splited_module.mlp2.fc2.register_forward_pre_hook(hook) + splited_module.mlp3.fc1.register_forward_pre_hook(hook) + splited_module.mlp3.gelu.register_forward_pre_hook(hook) + splited_module.mlp3.fc2.register_forward_pre_hook(hook) + splited_module.mlp4.fc1.register_forward_pre_hook(hook) + splited_module.mlp4.gelu.register_forward_pre_hook(hook) + splited_module.mlp4.fc2.register_forward_pre_hook(hook) + splited_module(input) + + @with_comms + def test_compile_equivalent(self): + """ + Tests correctness of registering hooks on partitioned model graphs. + """ + model = EightMLP(8) + + def hook(sel, args): + print(f"{torch.distributed.get_rank()}: call hook") + return args + + graph = fx.symbolic_trace(model) + input = torch.randn((3, 8)) + rule = r"mlp\d+.*" + for node in graph.graph.nodes: + if re.match(rule, node.name): + if int(node.name[3]) <= 4: + node.tag = "stage0" + else: + node.tag = "stage1" + global_graph = split_by_tags(graph, ["stage0", "stage1"]) + splited_module = global_graph.get_submodule("stage0") + call_modules_fqns = [node.target for node in splited_module.graph.nodes if node.op == "call_module"] + for submodule_path in call_modules_fqns: + splited_module.get_submodule(submodule_path).register_forward_pre_hook(hook) + splited_module(input) + + @with_comms + def test_decomposable_5d_parallelization(self): + """ + Tests decomposable API of writing 5D parallelization from plan to parallelization. 
+ """ + # build device mesh + device_mesh = VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", mesh_shape=(2, 1, 2), mesh_dim_names=["PP", "DP", "TP"] + ) + # deferred init mlp module + deferred_mlp = deferred_init(EightMLP, hidden=8) + # pipe module config + boundaries = ["mlp4", "mlp8"] + num_layers = 8 + pipe_config = PipelineParallelPlan( + num_stages=2, + split_method=PipelineSplitMethodType.MANUAL, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(num_layers)], + split_points=boundaries, + tracer_type=TracerType.TORCH_FX, + tracer_kwargs={"shard_plan": sharding_plan}, + ) + split_graph = construct_pipeline_split_graph(deferred_mlp, pipe_config, update_split_points=True) + + # parallelize and materialize module + model_chunks = [] + for i in range(pipe_config.num_stages): + stage = getattr(split_graph, f"stage{i}") + stage = parallelize_module( + stage, VESCALE_DEVICE_MESH.get_tensor_parallel_mesh(), sharding_plan, factory=False + ) + assert not is_deferred(stage) + model_chunks.append(stage) + if dist.get_rank() == 0: + assert model_chunks[0].mlp1.fc1.weight._spec.placements[0].is_shard() + + # make ddp module + ddp_models = [] + for model_chunk in model_chunks: + ddp_models.append( + DDP( + model_chunk, + VESCALE_DEVICE_MESH.get_data_parallel_mesh(), + accumulate_allreduce_grads_in_fp32=True, + overlap_grad_reduce=True, + use_distributed_optimizer=True, + ) + ) + + if dist.get_rank() == 0: + assert model_chunks[0].mlp1.fc1.weight._spec.placements[0].is_shard() + + # make optimizer + doptim = DistributedOptimizer( + torch.optim.Adam(split_graph.parameters(), lr=0.01), + models=ddp_models, + overlap_param_gather=False, + ) + tp_mesh = VESCALE_DEVICE_MESH.get_tensor_parallel_mesh() + stage_id = VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() + + num_layers = 8 + dataloader = [distribute_tensor(torch.zeros((5, 8)), tp_mesh, [Replicate()]) * i for i in range(num_layers)] + for sample in dataloader: + doptim.zero_grad() + output = ddp_models[stage_id](sample) + loss = output.mean() + loss.backward() + doptim.step() + + @with_comms + def test_manual_split_various_boundary_level(self): + """ + Tests PipeModule's ability to split stage by boundaries of various depths. + """ + VESCALE_DEVICE_MESH.init_device_mesh("cuda", (2, 1, 2), mesh_dim_names=("PP", "DP", "TP")) + deferred_mlp = deferred_init(EightMLP, hidden=8) + pipe_config = PipelineParallelPlan( + num_stages=2, + split_method=PipelineSplitMethodType.MANUAL, + smallest_unsplittable_units=["mlp7", "mlp8"], + split_points=["mlp4.fc1", "mlp8"], + tracer_type=TracerType.TORCH_FX, + tracer_kwargs={"partition_units": ["mlp7", "mlp8"]}, + ) + + split_graph = construct_pipeline_split_graph(deferred_mlp, pipe_config, update_split_points=True) + for i in range(pipe_config.num_stages): + stage = getattr(split_graph, f"stage{i}") + stage = parallelize_module( + stage, VESCALE_DEVICE_MESH.get_tensor_parallel_mesh(), sharding_plan_fc, factory=False + ) + assert not is_deferred(stage) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/backend/test_pipe_parser.py b/test/parallel/pipeline/backend/test_pipe_parser.py new file mode 100644 index 0000000..565b349 --- /dev/null +++ b/test/parallel/pipeline/backend/test_pipe_parser.py @@ -0,0 +1,172 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from torch.testing._internal.common_utils import run_tests +from common_dtensor import DTensorTestBase, with_comms +from vescale.pipe.pipe_parser import PipeParser +from vescale.initialize.deferred_init import deferred_init +from vescale.plan import PipelineParallelPlan, PipelineScheduleType, ModeType, PipelineSplitMethodType +from eight_mlp import EightMLP, EightMLPWithOps, EightMLPWithOpsTail + + +class TestPipeParser(DTensorTestBase): + @with_comms + def test_parse_naive_model(self): + """ + Tests trace capture with torch.fx symbolic tracer under user-defined granularity. + """ + deferred_mlp = deferred_init(EightMLP, hidden=8) + partition_units = ["mlp4", "mlp8"] + pipe_parser = PipeParser() + model_graph = pipe_parser.parse(deferred_mlp) + print(model_graph) + assert not all(node.target in partition_units for node in model_graph.graph.nodes) + + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + smallest_unsplittable_units=partition_units, + ) + model_graph_partition_units = pipe_parser.parse(deferred_mlp, pipe_config) + print(model_graph_partition_units) + assert any(node.target in partition_units for node in model_graph_partition_units.graph.nodes) + + @with_comms + def test_parse_huggingface_model(self): + """ + Tests trace capture with huggingface symbolic tracer under user-defined granularity. 
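+
+        smallest_unsplittable_units accepts either qualified names or module
+        classes; the two plans built below should therefore produce the same
+        parsed graph:
+
+            PipelineParallelPlan(smallest_unsplittable_units=["layers.0", "layers.1", "norm"])
+            PipelineParallelPlan(smallest_unsplittable_units=[LlamaDecoderLayer, LlamaRMSNorm])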
+ """ + from transformers import LlamaModel, LlamaConfig + from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm + + configuration = LlamaConfig() + configuration.hidden_size = 256 + configuration.intermediate_size = 1376 + configuration.num_attention_heads = 1 + configuration.num_hidden_layers = 2 + model = LlamaModel(configuration) + + # below two lists of partition units refer to the same submodules we never wish to partition + partition_units = ["layers.0", "layers.1", "norm"] + partition_units_equivalent = [LlamaDecoderLayer, LlamaRMSNorm] + pipe_config = PipelineParallelPlan(smallest_unsplittable_units=partition_units) + pipe_config_equivalent = PipelineParallelPlan(smallest_unsplittable_units=partition_units_equivalent) + + pipe_parser = PipeParser() + model_graph = pipe_parser.parse(model) + print(model_graph) + assert not all(node.target in partition_units for node in model_graph.graph.nodes) + + model_graph_partition_units = pipe_parser.parse(model, pipe_config) + print(model_graph_partition_units) + result = [node.target in partition_units for node in model_graph_partition_units.graph.nodes] + assert any(result) + + # the resulting graph should be identical to the one parsed by model_graph_partition_units + model_graph_partition_units_equivalent = pipe_parser.parse(model, pipe_config_equivalent) + print(model_graph_partition_units_equivalent) + result_two = [node.target in partition_units for node in model_graph_partition_units_equivalent.graph.nodes] + assert any(result_two) + self.assertEqual(result, result_two) + + @with_comms + def test_uniform_split(self): + """ + Tests uniform stage split. + """ + deferred_mlp = deferred_init(EightMLP, hidden=8) + layers = 8 + pipe_parser = PipeParser() + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=1, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(layers)], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + ) + model_graph_partition_units = pipe_parser.parse(deferred_mlp, pipe_config) + print(model_graph_partition_units) + splited_graph = pipe_parser.partition_stage(deferred_mlp, model_graph_partition_units, pipe_config) + self.assertEqual( + [node.name for node in splited_graph.stage0.graph.nodes][1:-1], ["mlp1", "mlp2", "mlp3", "mlp4"] + ) + self.assertEqual( + [node.name for node in splited_graph.stage1.graph.nodes][1:-1], ["mlp5", "mlp6", "mlp7", "mlp8"] + ) + + @with_comms + def test_uniform_split_model_with_ops(self): + """ + Tests uniform stage split with torch operators as graph components. 
+ """ + deferred_mlp = deferred_init(EightMLPWithOpsTail, hidden=8) + layers = 8 + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=1, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(layers)], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + ) + pipe_parser = PipeParser() + model_graph_partition_units = pipe_parser.parse(deferred_mlp, pipe_config) + print(model_graph_partition_units) + splited_graph = pipe_parser.partition_stage(deferred_mlp, model_graph_partition_units, pipe_config) + self.assertEqual( + [node.name for node in splited_graph.stage0.graph.nodes][1:-1], + ["add", "mlp1", "mul", "mlp2", "mul_1", "mlp3", "mlp4"], + ) + self.assertEqual( + [node.name for node in splited_graph.stage1.graph.nodes][1:-1], + ["mlp5", "mlp6", "mlp7", "mlp8", "mul_2", "mul_3", "add_1"], + ) + + @with_comms + def test_uniform_split_on_modules(self): + """ + Tests uniform stage split on modules with modules and torch operators. + """ + deferred_mlp = deferred_init(EightMLPWithOps, hidden=8) + layers = 8 + pipe_parser = PipeParser() + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=1, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(layers)], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + uniform_split_ops=True, + ) + model_graph_partition_units = pipe_parser.parse(deferred_mlp, pipe_config) + print(model_graph_partition_units) + splited_graph = pipe_parser.partition_stage(deferred_mlp, model_graph_partition_units, pipe_config) + stage_one_modules = ["add", "mlp1", "mul", "mlp2", "mul_1", "mul_2", "mul_3", "mlp3", "mlp4"] + stage_two_modules = ["mlp5", "mlp6", "mlp7", "mlp8"] + self.assertEqual([node.name for node in splited_graph.stage0.graph.nodes][1:-1], stage_one_modules) + self.assertEqual([node.name for node in splited_graph.stage1.graph.nodes][1:-1], stage_two_modules) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/backend/test_shard_plan.py b/test/parallel/pipeline/backend/test_shard_plan.py new file mode 100644 index 0000000..28f663f --- /dev/null +++ b/test/parallel/pipeline/backend/test_shard_plan.py @@ -0,0 +1,72 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from common_dtensor import DTensorTestBase, with_comms
+from vescale.pipe.pipe_parser import PipeParser
+from vescale.initialize.deferred_init import deferred_init
+from eight_mlp import EightMLP, sharding_plan
+from vescale.dmodule.api import parallelize_module
+from vescale.dtensor.api import distribute_tensor
+from vescale.devicemesh_api.api import VESCALE_DEVICE_MESH
+from vescale.dtensor.placement_types import Replicate
+from vescale.plan import PipelineParallelPlan, PipelineSplitMethodType
+
+
+class ShardPlanRegistrationTest(DTensorTestBase):
+    @with_comms
+    def test_manual_split_register_hook(self):
+        """
+        Tests manual stage split and registers hooks.
+        """
+        VESCALE_DEVICE_MESH.init_device_mesh("cuda", (2, 1, 2), mesh_dim_names=("PP", "DP", "TP"))
+        deferred_mlp = deferred_init(EightMLP, hidden=8)
+        partition_units = ["mlp1", "mlp8"]
+        pipe_config = PipelineParallelPlan(
+            num_stages=2,
+            split_method=PipelineSplitMethodType.UNIFORM,
+            smallest_unsplittable_units=partition_units,
+        )
+        pipe_parser = PipeParser()
+        input = torch.randn((3, 8))
+        model_graph = pipe_parser.parse(
+            deferred_mlp,
+            pipe_config,
+            **{"shard_plan": sharding_plan},
+        )
+        pipe_spec = pipe_parser.partition_stage(deferred_mlp, model_graph, pipe_config)
+        model_chunks = []
+        model_partition = pipe_spec.stage0
+        model = parallelize_module(
+            model_partition, VESCALE_DEVICE_MESH.get_tensor_parallel_mesh(), sharding_plan, factory=False
+        )
+
+        # hooks can be registered on the target modules because the split graph
+        # keeps them as (hierarchically flattened) submodules
+        def hook(module, args):
+            print("hook registered. Successful registration will trigger this printout!")
+            return args
+
+        model.get_submodule("mlp1").register_forward_pre_hook(hook)
+        d_input = distribute_tensor(input, VESCALE_DEVICE_MESH.get_tensor_parallel_mesh(), [Replicate()])
+        d_out = model(d_input)
+        model_chunks.append(model)
+        assert model_chunks[0].mlp1.fc1.weight._spec.placements[0].is_shard()
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/parallel/pipeline/backend/test_shared_params.py b/test/parallel/pipeline/backend/test_shared_params.py
new file mode 100644
index 0000000..6e3a695
--- /dev/null
+++ b/test/parallel/pipeline/backend/test_shared_params.py
@@ -0,0 +1,301 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +################################################################################ + +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +from common_dtensor import DTensorTestBase, with_comms +from vescale.dtensor.api import distribute_tensor +from vescale.optim.base_optimizer import BasicOptimizer +from vescale.initialize import materialize_module +from vescale.ddp.distributed_data_parallel import DistributedDataParallel as DDP +from vescale.plan import ( + PipelineParallelPlan, + PipelineSplitMethodType, + PipelineScheduleType, + ModeType, +) +from vescale.pipe import PipeModule, build_shared_module_group, construct_stage_modules, construct_pipeline_split_graph +from vescale.initialize.deferred_init import deferred_init +from eight_mlp import sharding_plan, EightMLPSharedEmbed +from vescale.dtensor.placement_types import Replicate +from vescale.dmodule.api import parallelize_module + + +microbatch_size = 16 +factor = 16 +batch_size = microbatch_size * factor +RANDOM_SEED = 9999 + + +class SharedParamsTest(DTensorTestBase): + @property + def world_size(self): + return 8 + + @with_comms + def test_sync_embedding_weights_two_stages(self): + """ + Test correctness of synchronizing "shared_units" (embedding) + weights upon engine initialization. + """ + pp_size = 2 + dp_size = 2 + tp_size = 2 + deferred_mlp = deferred_init(EightMLPSharedEmbed, hidden=8) + partition_units = [f"mlp{i + 1}" for i in range(8)] + ["embed1", "embed2"] + pp_plan = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=1, + smallest_unsplittable_units=partition_units, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + shared_modules=[ + ["embed1", "embed2"] + ], # each sublist represents a group of modules to synchronize params/grads + ) + split_graph = construct_pipeline_split_graph(deferred_mlp, pp_plan, update_split_points=True) + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(pp_size, dp_size, tp_size), + mesh_dim_names=["PP", "DP", "TP"], + ) + + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + deferred_mlp, + pp_plan, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + for module in stage_modules: + materialize_module(module) + module.cuda() + + combined_parameters = list(stage_modules[0].parameters()) + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + optimizer = torch.optim.SGD(combined_parameters, **optimizer_fn_kwargs) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pp_plan) + + build_shared_module_group( + pipe_module, + split_graph, + pp_plan.num_stages, + pp_plan.virtual_chunks, + pp_plan.shared_modules, + VESCALE_DEVICE_MESH, + ) + if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 0: + embedding = pipe_module[0].get_submodule("embed1").get_word_embeddings_weight().data + else: + embedding = pipe_module[0].get_submodule("embed2").get_word_embeddings_weight().data + pipe_module.sync_shared_params(VESCALE_DEVICE_MESH, group_id=0, share_params=True, chunk_id=0) + if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 0: + sync_embedding = pipe_module[0].get_submodule("embed1").get_word_embeddings_weight().data + else: + sync_embedding = 
pipe_module[0].get_submodule("embed2").get_word_embeddings_weight().data
+        torch.testing.assert_close(embedding, sync_embedding)  # raises on mismatch
+
+    @with_comms
+    def test_sync_embedding_weights_four_stages(self):
+        """
+        Test correctness of synchronizing "shared_units" (embedding)
+        weights given four stages partitioned.
+        """
+        pp_size = 4
+        dp_size = 2
+        tp_size = 1
+        model = EightMLPSharedEmbed(hidden=8).cuda()
+        partition_units = [f"mlp{i + 1}" for i in range(8)] + ["embed1", "embed2"]
+        pp_plan = PipelineParallelPlan(
+            mode=ModeType.GRAPH_EAGER,
+            split_method=PipelineSplitMethodType.MANUAL,
+            num_stages=4,
+            virtual_chunks=1,
+            smallest_unsplittable_units=partition_units,
+            schedule_type=PipelineScheduleType.SIMPLE_1F1B,
+            split_points=["mlp2", "mlp5", "mlp7", "embed2"],
+            shared_modules=[
+                ["embed1", "embed2"]
+            ],  # each sublist represents a group of modules to synchronize params/grads
+        )
+
+        split_graph = construct_pipeline_split_graph(model, pp_plan, update_split_points=True)
+        VESCALE_DEVICE_MESH.init_device_mesh(
+            device_type="cuda",
+            mesh_shape=(pp_size, dp_size, tp_size),
+            mesh_dim_names=["PP", "DP", "TP"],
+        )
+
+        stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules(
+            model,
+            pp_plan,
+            VESCALE_DEVICE_MESH,
+            update_split_points=True,
+        )
+        combined_parameters = list(stage_modules[0].parameters())
+        optimizer_fn_kwargs = {
+            "lr": 0.01,
+            "momentum": 0,
+            "dampening": 0,
+            "weight_decay": 0,
+            "nesterov": False,
+            "maximize": False,
+            "foreach": None,
+            "differentiable": False,
+        }
+        optimizer = torch.optim.SGD(combined_parameters, **optimizer_fn_kwargs)
+        basic_optimizer = BasicOptimizer(optimizer, models=stage_modules)
+        pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pp_plan)
+
+        build_shared_module_group(
+            pipe_module,
+            split_graph,
+            pp_plan.num_stages,
+            pp_plan.virtual_chunks,
+            pp_plan.shared_modules,
+            VESCALE_DEVICE_MESH,
+        )
+        if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 0:
+            embedding = pipe_module[0].get_submodule("embed1").get_word_embeddings_weight().data
+        elif VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 3:
+            embedding = pipe_module[0].get_submodule("embed2").get_word_embeddings_weight().data
+        else:
+            embedding = None
+        if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() in [0, 3]:
+            pipe_module.sync_shared_params(VESCALE_DEVICE_MESH, group_id=0, share_params=True, chunk_id=0)
+        if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 0:
+            sync_embedding = pipe_module[0].get_submodule("embed1").get_word_embeddings_weight().data
+            torch.testing.assert_close(embedding, sync_embedding)  # raises on mismatch
+        elif VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 3:
+            sync_embedding = pipe_module[0].get_submodule("embed2").get_word_embeddings_weight().data
+            torch.testing.assert_close(embedding, sync_embedding)  # raises on mismatch
+
+    @with_comms
+    def test_sync_embedding_gradients(self):
+        """
+        Test correctness of synchronizing "shared_units" (embedding)
+        gradients given uniform partition results.
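+
+        sync_shared_params() is driven by its share_params flag, as used in
+        this file:
+
+            # share_params=True  -> synchronize the shared weights themselves
+            pipe_module.sync_shared_params(VESCALE_DEVICE_MESH, group_id=0, share_params=True, chunk_id=0)
+            # share_params=False -> synchronize the accumulated gradients (main_grad) instead
+            pipe_module.sync_shared_params(VESCALE_DEVICE_MESH, group_id=0, share_params=False, chunk_id=0)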
+ """ + pp_size = 2 + dp_size = 4 + tp_size = 1 + model = EightMLPSharedEmbed(hidden=8).cuda() + partition_units = [f"mlp{i + 1}" for i in range(8)] + ["embed1", "embed2"] + + pp_plan = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.UNIFORM, + num_stages=2, + virtual_chunks=1, + smallest_unsplittable_units=partition_units, + schedule_type=PipelineScheduleType.SIMPLE_1F1B, + shared_modules=[ + ["embed1", "embed2"] + ], # each sublist represents a group of modules to synchronize params/grads + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + + split_graph = construct_pipeline_split_graph(model, pp_plan, update_split_points=True) + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(pp_size, dp_size, tp_size), + mesh_dim_names=["PP", "DP", "TP"], + ) + tp_mesh = VESCALE_DEVICE_MESH["TP"] + dp_mesh = VESCALE_DEVICE_MESH["DP"] + + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + pp_plan, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + for i in range(len(stage_modules)): + parallelized_module = parallelize_module( + stage_modules[i], + tp_mesh, + sharding_plan, + factory=False, + ) + ddp_module = DDP( + parallelized_module, + dp_mesh, + accumulate_allreduce_grads_in_fp32=True, + overlap_grad_reduce=True, + use_distributed_optimizer=False, + disable_bucketing=False, + bucket_size=40000000, + ) + stage_modules[i] = ddp_module + combined_parameters = list(stage_modules[0].parameters()) + optimizer = torch.optim.SGD(combined_parameters, **optimizer_fn_kwargs) + basic_optimizer = BasicOptimizer(optimizer, models=stage_modules) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pp_plan) + + build_shared_module_group( + pipe_module, + split_graph, + pp_plan.num_stages, + pp_plan.virtual_chunks, + pp_plan.shared_modules, + VESCALE_DEVICE_MESH, + ) + loss_fn = nn.MSELoss() + input_tensor = distribute_tensor(torch.ones(3).long().cuda(), tp_mesh, [Replicate()]) + + if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 0: + embed = pipe_module[0].module.embed1 + else: + embed = pipe_module[0].module.embed2 + output = embed(input_tensor) + target = torch.zeros_like(output) + target = distribute_tensor(target, tp_mesh, [Replicate()]) + losses = loss_fn(output, target) + losses.backward() + old_grad = embed.embedding.weight.main_grad.clone() + pipe_module.sync_shared_params(VESCALE_DEVICE_MESH, group_id=0, share_params=False, chunk_id=0) + if VESCALE_DEVICE_MESH.get_pipeline_parallel_rank() == 0: + embed = pipe_module[0].module.embed1 + else: + embed = pipe_module[0].module.embed2 + new_grad = embed.embedding.weight.main_grad.clone() + assert not torch.equal(old_grad, new_grad) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/backend/test_trace_parser.py b/test/parallel/pipeline/backend/test_trace_parser.py new file mode 100644 index 0000000..df32f87 --- /dev/null +++ b/test/parallel/pipeline/backend/test_trace_parser.py @@ -0,0 +1,133 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + + +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.pipe.tracer import ModelTracer, HFModelTracer, register_partition_module, hf_symbolic_trace +from common_dtensor import DTensorTestBase, with_comms +from transformers import LlamaModel, LlamaConfig +from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm + + +class MLP(nn.Module): + def __init__(self, features_in, features_out, value): + super().__init__() + self.value = value + self.fc1 = nn.Linear(features_in, 2 * features_in, bias=False) + self.fc1.weight.data.fill_(value) + self.fc2 = nn.Linear(2 * features_in, features_out, bias=False) + self.fc2.weight.data.fill_(value * 2) + self.gelu = nn.GELU() + + def forward(self, x): + t = self.fc1(x) + t = self.gelu(t) + t = self.fc2(t) + return t + + +class EightMLP(nn.Module): + def __init__(self, hidden=1024): + super().__init__() + self.mlp1 = MLP(hidden, hidden, 0) + self.mlp2 = MLP(hidden, hidden, 1) + self.mlp3 = MLP(hidden, hidden, 2) + self.mlp4 = MLP(hidden, hidden, 3) + self.mlp5 = MLP(hidden, hidden, 4) + self.mlp6 = MLP(hidden, hidden, 5) + self.mlp7 = MLP(hidden, hidden, 6) + self.mlp8 = MLP(hidden, hidden, 7) + self.sequence = nn.Sequential( + self.mlp1, + self.mlp2, + self.mlp3, + self.mlp4, + self.mlp5, + self.mlp6, + self.mlp7, + self.mlp8, + ) + + def forward(self, x): + return self.sequence(x) + + +class TracerTest(DTensorTestBase): + @property + def world_size(self): + return 1 + + @with_comms + def test_simple_model_tracer(self): + """ + Test fx tracer to capture native symbolic trace + of simple model. + """ + model = EightMLP(16) + tracer = ModelTracer() + traced_graph = tracer.trace(model) + print("Simple Model Graph Trace:") + print(traced_graph) + + @with_comms + def test_simple_model_tracer_with_partition_units(self): + """ + Test fx tracer to capture symbolic trace with granularity of + MLP level (do not dive into operators of MLP) of simple model. + """ + model = EightMLP(16) + register_partition_module(model.mlp1) + register_partition_module(model.mlp2) + register_partition_module(model.mlp3) + register_partition_module(model.mlp4) + register_partition_module(model.mlp5) + register_partition_module(model.mlp6) + register_partition_module(model.mlp7) + register_partition_module(model.mlp8) + tracer = ModelTracer() + traced_graph = tracer.trace(model) + print(traced_graph) + + @with_comms + def test_huggingface_model_tracer_with_partition_units(self): + """ + Test huggingface tracer to capture symbolic trace with granularity + of LlamaDecoderLayer and LlamaRMSNorm. 
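+
+        partition_modules may mix module classes and qualified names; the
+        tracer should then keep those submodules opaque instead of descending
+        into their operators. Sketch of the call made below:
+
+            hf_symbolic_trace(
+                model,
+                input_names=["input_ids", "attention_mask"],
+                tracer_cls=HFModelTracer,
+                partition_modules=[LlamaDecoderLayer, LlamaRMSNorm, "layers.0", ...],
+            )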
+ """ + configuration = LlamaConfig() + configuration.hidden_size = 1024 + configuration.intermediate_size = 5504 + configuration.num_attention_heads = 1 + configuration.num_hidden_layers = 2 + + model = LlamaModel(configuration) + submodule_qualified_names = ["layers.0", "layers.1", "norm"] + # submodules indicated by submodule_qualified_names are modules that have the classes below + partition_unit_modules = [LlamaDecoderLayer, LlamaRMSNorm] + submodule_qualified_names + traced_graph = hf_symbolic_trace( + model, + input_names=["input_ids", "attention_mask"], + tracer_cls=HFModelTracer, + partition_modules=partition_unit_modules, + ) + print("HF Model Graph Trace:") + print(traced_graph) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/e2e/test_pp_accuracy_alignment.py b/test/parallel/pipeline/e2e/test_pp_accuracy_alignment.py new file mode 100644 index 0000000..163f453 --- /dev/null +++ b/test/parallel/pipeline/e2e/test_pp_accuracy_alignment.py @@ -0,0 +1,247 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import os +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.plan import ( + PipelineParallelPlan, + PipelineScheduleType, + ModeType, + PipelineSplitMethodType, +) +from vescale.pipe import PipeModule, construct_stage_modules +from vescale.engine import PipeEngine +from common_dtensor import DTensorTestBase, with_comms +from vescale.devicemesh_api import VESCALE_DEVICE_MESH + +microbatch_size = 2 +factor = 8 +batch_size = microbatch_size * factor +stage = 4 +RANDOM_SEED = 9999 + + +class MLP(nn.Module): + def __init__(self, features_in, feature_middle, features_out, value, idx=1): + super().__init__() + self.value = value + self.idx = idx + self.counter = 0 + self.fc1 = nn.Linear(features_in, feature_middle, bias=False) + self.fc2 = nn.Linear(feature_middle, features_out, bias=False) + self.gelu = nn.GELU() + + def forward(self, x): + t = self.fc1(x) + t = self.gelu(t) + t = self.fc2(t) + torch.save(t, f"{os.environ['model_name']}_mlp{self.value}_fwd{self.counter}_out_tensor.pt") + self.counter += 1 + return t + + +class EightMLP(nn.Module): + def __init__(self, hidden=1024, fixed_size=True): + super().__init__() + self.mlp1 = MLP(hidden, hidden, hidden, 1, 1) + self.mlp2 = MLP(hidden, hidden, hidden, 2, 2) + self.mlp3 = MLP(hidden, hidden, hidden, 1, 3) + self.mlp4 = MLP(hidden, hidden, hidden, 2, 4) + self.mlp5 = MLP(hidden, hidden, hidden, 1, 5) + self.mlp6 = MLP(hidden, hidden, hidden, 2, 6) + self.mlp7 = MLP(hidden, hidden, hidden, 1, 7) + self.mlp8 = MLP(hidden, hidden, hidden, 2, 8) + + def forward(self, x): + x = self.mlp1(x) + x = self.mlp2(x) + x = self.mlp3(x) + x = self.mlp4(x) + x = self.mlp5(x) + x = self.mlp6(x) + x = self.mlp7(x) + x = 
self.mlp8(x) + return x + + +class PipelineAccuracyAlignmentTest(DTensorTestBase): + @property + def world_size(self): + return 4 + + @staticmethod + def loss_fn(x): + return x.mean() + + @staticmethod + def save_mlp_parameter(model: MLP, f_name): + torch.save(model.fc1.weight, f"{f_name}.fc1") + torch.save(model.fc2.weight, f"{f_name}.fc2") + + @staticmethod + def load_mlp_parameter(f_prefix): + fc1_weight = torch.load(f"{f_prefix}.fc1").to("cuda:0") + fc2_weight = torch.load(f"{f_prefix}.fc2").to("cuda:0") + return (fc1_weight, fc2_weight) + + def check_model_weight_diff(self, f_prefix): + def helper(f1, f2): + golden_weights = self.load_mlp_parameter(f1) + pp_weights = self.load_mlp_parameter(f2) + torch.testing.assert_close(golden_weights[0], pp_weights[0]) + torch.testing.assert_close(golden_weights[1], pp_weights[1]) + + helper(f"golden_mlp{self.rank + 1}", f"{f_prefix}_mlp{self.rank + 1}") + + def check_out_tensors(self, model_name): + def helper(f1, f2): + golden_out = torch.load(f1).to("cuda:0") + pp_out = torch.load(f2).to("cuda:0") + torch.testing.assert_close(golden_out, pp_out) + + for i in range(1, 3): + for j in range(8): + helper(f"golden_mlp{i}_fwd{j}_out_tensor.pt", f"{model_name}_mlp{i}_fwd{j}_out_tensor.pt") + torch.cuda.synchronize() + + def test_accuracy_alignment(self, fixed_size=True): + """ + Tests alignment of updated parameter and output activations of single device model and + the model partitioned into four stages with pipeline parallelism API. + """ + if self.rank == 0: + self._run_no_pp_model(fixed_size=fixed_size) + torch.cuda.synchronize() + n_gpus = torch.cuda.device_count() + assert n_gpus >= 2, "Requires at least 2 GPUs to run model with pp engine" + self._run_engine_with_1f1b(fixed_size=fixed_size) + if self.rank == 0: + self.check_out_tensors("pp") + self.check_model_weight_diff("engine_1f1b") + + def _run_no_pp_model(self, fixed_size=True): + os.environ["model_name"] = "golden" + model = EightMLP(16, fixed_size=fixed_size).to("cuda:0") + torch.save(model.state_dict(), "baseline_model.pt") + optimizer = torch.optim.SGD( + model.parameters(), + lr=0.01, + momentum=0, + dampening=0, + weight_decay=0, + nesterov=False, + maximize=False, + foreach=None, + differentiable=False, + ) + torch.manual_seed(9999) + batch = [torch.ones(microbatch_size, 128, 16, dtype=torch.float32).to("cuda:0") for _ in range(factor)] + for mb in batch: + out = model(mb) + loss = self.loss_fn(out) + loss.backward() + optimizer.step() + torch.save(out, "golden_out.pt") + torch.save(loss, "golden_loss.pt") + self.save_mlp_parameter(model.mlp1, "golden_mlp1") + self.save_mlp_parameter(model.mlp2, "golden_mlp2") + self.save_mlp_parameter(model.mlp3, "golden_mlp3") + self.save_mlp_parameter(model.mlp4, "golden_mlp4") + self.save_mlp_parameter(model.mlp5, "golden_mlp5") + self.save_mlp_parameter(model.mlp6, "golden_mlp6") + self.save_mlp_parameter(model.mlp7, "golden_mlp7") + self.save_mlp_parameter(model.mlp8, "golden_mlp8") + + @with_comms + def _run_engine_with_1f1b(self, fixed_size=True): + os.environ["model_name"] = "pp" + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + model = EightMLP(16, fixed_size=fixed_size).cuda() + model.load_state_dict(torch.load("baseline_model.pt")) + + pipe_config = PipelineParallelPlan( + mode=ModeType.GRAPH_EAGER, + split_method=PipelineSplitMethodType.MANUAL, + num_stages=4, + virtual_chunks=2, + smallest_unsplittable_units=["mlp1", "mlp2", "mlp3", "mlp4", "mlp5", 
"mlp6", "mlp7", "mlp8"], + split_points=["mlp2", "mlp4", "mlp6", "mlp8"], + batch_p2p_comm=False, + overlap_p2p_comm=True, + schedule_type=PipelineScheduleType.INTERLEAVED_1F1B, + ) + + optimizer_fn_kwargs = { + "lr": 0.01, + "momentum": 0, + "dampening": 0, + "weight_decay": 0, + "nesterov": False, + "maximize": False, + "foreach": None, + "differentiable": False, + } + + torch.manual_seed(9999) + with torch.no_grad(): + batch = [torch.ones(microbatch_size, 128, 16, dtype=torch.float32).to(device) for _ in range(factor)] + + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=["PP", "DP", "TP"], + ) + stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules( + model, + pipe_config, + VESCALE_DEVICE_MESH, + update_split_points=True, + ) + _parameters = list(stage_modules[0].parameters()) + list(stage_modules[1].parameters()) + optimizer = torch.optim.SGD(_parameters, **optimizer_fn_kwargs) + pipe_module = PipeModule(stage_modules, optimizer, None, stage_dependency, p2p_index_mapping, pipe_config) + engine = PipeEngine( + pipe_module, + VESCALE_DEVICE_MESH, + self.loss_fn, + pipe_config, + ) + + engine.forward_backward(batch) + optimizer = engine.get_optimizer + optimizer.step() + + if self.rank == 0: + self.save_mlp_parameter(engine.module[0].get_submodule("mlp1"), "engine_1f1b_mlp1") + self.save_mlp_parameter(engine.module[1].get_submodule("mlp5"), "engine_1f1b_mlp5") + if self.rank == 1: + self.save_mlp_parameter(engine.module[0].get_submodule("mlp2"), "engine_1f1b_mlp2") + self.save_mlp_parameter(engine.module[1].get_submodule("mlp6"), "engine_1f1b_mlp6") + if self.rank == 2: + self.save_mlp_parameter(engine.module[0].get_submodule("mlp3"), "engine_1f1b_mlp3") + self.save_mlp_parameter(engine.module[1].get_submodule("mlp7"), "engine_1f1b_mlp7") + if self.rank == 3: + self.save_mlp_parameter(engine.module[0].get_submodule("mlp4"), "engine_1f1b_mlp4") + self.save_mlp_parameter(engine.module[1].get_submodule("mlp8"), "engine_1f1b_mlp8") + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/instruction/four_mlp.py b/test/parallel/pipeline/instruction/four_mlp.py new file mode 100644 index 0000000..b6ac2f4 --- /dev/null +++ b/test/parallel/pipeline/instruction/four_mlp.py @@ -0,0 +1,71 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +import torch.nn as nn +from vescale.dtensor.placement_types import Shard, Replicate + + +class MLP(nn.Module): + def __init__(self, features_in, features_out, value): + super().__init__() + self.value = value + self.fc1 = nn.Linear(features_in, 16, bias=False) + self.fc1.weight.data.fill_(value) + self.fc2 = nn.Linear(16, features_out, bias=False) + self.fc2.weight.data.fill_(value * 2) + self.gelu = nn.GELU() + + def forward(self, x): + t = self.fc1(x) + t = self.gelu(t) + t = self.fc2(t) + return t + + +class FourMLP(nn.Module): + def __init__(self, hidden=64, fixed_size=True): + super().__init__() + if fixed_size: + self.mlp1 = MLP(hidden, hidden, 1) + self.mlp2 = MLP(hidden, hidden, 2) + self.mlp3 = MLP(hidden, hidden, 3) + self.mlp4 = MLP(hidden, hidden, 4) + else: + self.mlp1 = MLP(hidden * 1, hidden * 2, 1) + self.mlp2 = MLP(hidden * 2, hidden * 3, 2) + self.mlp3 = MLP(hidden * 3, hidden * 4, 3) + self.mlp4 = MLP(hidden * 4, hidden * 5, 4) + + def forward(self, x): + x = self.mlp1(x) + x = self.mlp2(x) + x = self.mlp3(x) + x = self.mlp4(x) + return x + + +sharding_plan = { + "forward": { + ".input": [[Replicate()]], + r"mlp\d.fc1.input": [[Replicate()]], + r"mlp\d.fc2.output": [[Replicate()]], + }, + "parameter": { + r"mlp\d.fc1.weight": [Shard(0)], + r"mlp\d.fc2.weight": [Shard(1)], + }, +} diff --git a/test/parallel/pipeline/instruction/test_multistage_schedule.py b/test/parallel/pipeline/instruction/test_multistage_schedule.py new file mode 100644 index 0000000..8695c0c --- /dev/null +++ b/test/parallel/pipeline/instruction/test_multistage_schedule.py @@ -0,0 +1,190 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +from common_dtensor import DTensorTestBase, with_comms +from vescale.pipe._schedules.instruction_base import StageDeps +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +import numpy as np +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.plan.spec import PipelineP2PSpec + + +class MLP(nn.Module): + def __init__(self, n_features): + super().__init__() + self.fc1 = nn.Linear(n_features, n_features * 2, bias=False) + torch.nn.init.uniform_(self.fc1.weight, 0, 1) + self.fc2 = nn.Linear(n_features * 2, n_features) + torch.nn.init.uniform_(self.fc2.weight, 0, 1) + self.gelu = nn.GELU() + + def forward(self, x, y=None): + out = self.fc2(self.gelu(self.fc1(x))) + if y is not None: + out = out + y + return out + + +class FourMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden) + self.mlp2 = MLP(hidden) + self.mlp3 = MLP(hidden) + self.mlp4 = MLP(hidden) + + def forward(self, x): + stage1 = self.mlp1(x) + stage2 = self.mlp2(stage1) + stage3 = self.mlp3(stage2, x) + stage4 = self.mlp4(stage3) + return stage4 + + +class MultiStageCommTest(DTensorTestBase): + def test_send_order(self): + """ + Tests send order. + + stage 0: a , c + stage 1: b + stage 2: dataloader + + stage 2: forward(c,b,dataloader,a) + + """ + a = torch.tensor(0) + b = torch.tensor(1) + c = torch.tensor(2) + d = torch.tensor(3) + p2p_tensors = [a, c, b] + p2p_index = [PipelineP2PSpec(0, 2), PipelineP2PSpec(1, 0), PipelineP2PSpec(2, 0), PipelineP2PSpec(0, 0)] + local_inputs = [d] + + p2p_index_without_local = list(filter(lambda item: item.peer_stage_idx != 2, p2p_index)) + p2p_send_order = sorted(p2p_index_without_local, key=lambda x: (x.peer_stage_idx, x.peer_output_idx)) + p2p_tensor_order = [p2p_send_order.index(item) for item in p2p_index_without_local] + ordered_p2p_tensors = [p2p_tensors[x] for x in p2p_tensor_order] + + assert ordered_p2p_tensors == [c, b, a] + + args = [] + local_input_mapping = list(filter(lambda item: item.peer_stage_idx == 2, p2p_index)) + for item in p2p_index: + if item.peer_stage_idx == 2: + index = local_input_mapping.index(item) + args.append(local_inputs[index]) + else: + index = p2p_send_order.index(item) + args.append(p2p_tensors[index]) + assert args == [c, b, d, a] + + @with_comms + def test_stage_deps(self): + """ + Tests abstraction of inter-stage communication dependency. + """ + # initialize global device mesh + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=("PP", "DP", "TP"), + ) + print(VESCALE_DEVICE_MESH.get()) + + # case 1 - sequential input is one + single_deps = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) + stage = StageDeps( + single_deps, + VESCALE_DEVICE_MESH.get_global_pipeline_parallel_meshes(), + [], + ) + if torch.distributed.distributed_c10d.get_rank() == 0: + print(stage) + + # case 2 - sequential multi input + single_deps = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) + p2p_index_mapping = {1: [PipelineP2PSpec(0, 0), PipelineP2PSpec(0, 1)]} + stage = StageDeps( + single_deps, + VESCALE_DEVICE_MESH.get_global_pipeline_parallel_meshes(), + [], + p2p_index_mapping=p2p_index_mapping, + ) + if torch.distributed.distributed_c10d.get_rank() == 0: + print(stage) + + # case 3 - sequential multi input with local_dataloader + """ + The adjacency matrix for 4 stages is formulated as a 4x4 matrix. 
The meaning can be interpreted as follows: + Row (Stage) 0: [0, 1, 0, 0]: stage 0 sends output to stage 1 (index position 1). + Row (Stage) 1: [0, 0, 1, 0]: stage 1 sends output to stage 2 (index position 2). + Row (Stage) 2: [0, 0, 0, 1]: stage 2 sends output to stage 3 (index position 3). + Row (Stage) 3: [0, 0, 0, 0]: stage 3 sends no output to any other stage. + """ + single_deps = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) + p2p_index_mapping = {1: [PipelineP2PSpec(0, 2), PipelineP2PSpec(1, 0), PipelineP2PSpec(0, 0)]} + stage = StageDeps( + single_deps, + VESCALE_DEVICE_MESH.get_global_pipeline_parallel_meshes(), + [], + p2p_index_mapping=p2p_index_mapping, + ) + if torch.distributed.distributed_c10d.get_rank() == 0: + print(stage) + + # case 4 - multi branch input with single data + """ + The adjacency matrix for 4 stages is formulated as a 4x4 matrix. The meaning can be interpreted as follows: + Row (Stage) 0: [0, 1, 0, 0]: stage 0 sends output to stage 1 (index position 1). + Row (Stage) 1: [0, 0, 1, 0]: stage 1 sends output to stage 2 (index position 2). + Row (Stage) 2: [0, 0, 0, 1]: stage 2 sends output to stage 3 (index position 3). + Row (Stage) 3: [0, 0, 0, 0]: stage 3 sends no output to any other stage. + """ + single_deps = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) + p2p_index_mapping = {2: [PipelineP2PSpec(0, 0), PipelineP2PSpec(1, 0)]} + stage = StageDeps( + single_deps, + VESCALE_DEVICE_MESH.get_global_pipeline_parallel_meshes(), + [], + p2p_index_mapping=p2p_index_mapping, + ) + if torch.distributed.distributed_c10d.get_rank() == 0: + print(stage) + + # case 5 - vpp test + """ + The adjacency matrix for 4 stages is formulated as a 4x4 matrix. The meaning can be interpreted as follows: + Row (Stage) 0: [0, 1, 0, 0]: stage 0 sends output to stage 1 (index position 1). + Row (Stage) 1: [0, 0, 1, 0]: stage 1 sends output to stage 2 (index position 2). + Row (Stage) 2: [0, 0, 0, 1]: stage 2 sends output to stage 3 (index position 3). + Row (Stage) 3: [0, 0, 0, 0]: stage 3 sends no output to any other stage. + """ + single_deps = np.array([[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]) + stage = StageDeps( + single_deps, + VESCALE_DEVICE_MESH.get_global_pipeline_parallel_meshes(), + [0, 1], + ) + if torch.distributed.distributed_c10d.get_rank() == 0: + print(stage) + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/instruction/test_pipe_instruction_register.py b/test/parallel/pipeline/instruction/test_pipe_instruction_register.py new file mode 100644 index 0000000..ba4b36f --- /dev/null +++ b/test/parallel/pipeline/instruction/test_pipe_instruction_register.py @@ -0,0 +1,60 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# +################################################################################ + +import unittest +from unittest import TestCase +from vescale.pipe._schedules.instruction_base import register_instruction, registed_functions, InstructionBuilder + + +class InstructionRegistrationTest(TestCase): + def test_pp_registed_function(self): + """ + Tests instruction registration. + """ + + @register_instruction(name="instruction_one") + def instruction_one(input): + print(input) + return input + + assert "instruction_one" in registed_functions + + def test_instruction_constructor(self): + """ + Tests instruction construction. + """ + + @register_instruction(name="I1") + def instruction_one(input): + return input + 1 + + @register_instruction(name="I2") + def instruction_two(input): + return input * 2 + + @register_instruction(name="B") + def bubble(input): + return input + + instructions = {0: "B,I1,I1,I1,I1,I2,I2", 1: "B,I2,I2,I2,I2,I1,I1,I1"} + builder = InstructionBuilder() + builder.build_from_dict(instructions) + builder.draw_instructions() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/parallel/pipeline/instruction/test_schedule.py b/test/parallel/pipeline/instruction/test_schedule.py new file mode 100644 index 0000000..9a185dc --- /dev/null +++ b/test/parallel/pipeline/instruction/test_schedule.py @@ -0,0 +1,529 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +import os +from common_dtensor import DTensorTestBase, with_comms +from vescale.pipe._schedules.instruction_base import get_linear_pp_module_dep2 +from vescale.pipe._schedules.pipedream_flush import PipeDream +from vescale.pipe._schedules.looping_bfs import InterleavedPipeDreramFlush +from vescale.devicemesh_api import VESCALE_DEVICE_MESH +import torch +import torch.nn as nn +from torch.testing._internal.common_utils import run_tests +from vescale.dtensor.api import distribute_tensor +from vescale.dtensor.device_mesh import DeviceMesh +from vescale.dmodule.api import parallelize_module +from vescale.dtensor.placement_types import Replicate +from vescale.plan.spec import PipelineScheduleType +from vescale.pipe.pipe_emmiter import ScheduleEngine + + +class MLP(nn.Module): + def __init__(self, n_features): + super().__init__() + self.fc1 = nn.Linear(n_features, n_features * 2, bias=False) + torch.nn.init.uniform_(self.fc1.weight, 0, 1) + self.fc2 = nn.Linear(n_features * 2, n_features) + torch.nn.init.uniform_(self.fc2.weight, 0, 1) + self.gelu = nn.GELU() + + def forward(self, x): + return self.fc2(self.gelu(self.fc1(x))) + + def forward_utils(p2p, dataloader): + if p2p is not None: + return p2p + else: + return dataloader + + +class FourMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlp1 = MLP(hidden) + self.mlp2 = MLP(hidden) + self.mlp3 = MLP(hidden) + self.mlp4 = MLP(hidden) + + def forward(self, x): + return self.mlp4(self.mlp3(self.mlp2(self.mlp1(x)))) + + +class EightMLP(nn.Module): + def __init__(self, hidden): + super().__init__() + self.mlps = [MLP(hidden) for _ in range(8)] + + def forward(self, x): + all_input_x = [] + for idx, mlp in enumerate(self.mlps): + x = mlp(x) + x.retain_grad() + all_input_x.append(x) + print(f"mlp: {idx} output : {x}") + return x, all_input_x + + +class PipelineScheduleTest(DTensorTestBase): + @property + def world_size(self): + return 4 + + @staticmethod + def loss_fn(x): + return x.sum() + + @with_comms + def test_1f1b_schedules(self): + """ + Test generation of simple 1f1b schedule. + """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1 = DeviceMesh(device, [0]) + device_mesh_stage2 = DeviceMesh(device, [1]) + device_mesh_stage3 = DeviceMesh(device, [2]) + device_mesh_stage4 = DeviceMesh(device, [3]) + meshes = (device_mesh_stage1, device_mesh_stage2, device_mesh_stage3, device_mesh_stage4) + microbatch = 8 + batch = 8 + stage = 4 + schedule = PipeDream(stage, meshes, batch) + if torch.distributed.distributed_c10d.get_rank() == 0: + print(schedule) + + @with_comms + def test_interleaved_1f1b_schedules(self): + """ + Test generation of interleaved 1f1b schedule. 
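+ In the interleaved schedule, each rank hosts num_chunks non-contiguous model chunks + (e.g., with 4 ranks and 2 chunks per rank, rank 0 holds chunks 1 and 5 of an 8-chunk model, + as in the engine tests below), which shrinks the pipeline bubble relative to simple 1F1B + at the cost of extra p2p traffic.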
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + device_mesh_stage1 = DeviceMesh(device, [0]) + device_mesh_stage2 = DeviceMesh(device, [1]) + device_mesh_stage3 = DeviceMesh(device, [2]) + device_mesh_stage4 = DeviceMesh(device, [3]) + meshes = (device_mesh_stage1, device_mesh_stage2, device_mesh_stage3, device_mesh_stage4) + batches = 8 + num_chunks = 2 + schedule = InterleavedPipeDreramFlush( + num_chunks, meshes, default_shape=[1, 1, 3], default_dtype=torch.float32, batches=batches + ) + if self.rank == 0: + print(schedule) + + @with_comms + def test_runtime_engine_with_profiling(self): + """ + Tests runtime engine with distributed nD timeline profiling. + """ + # initialize global device mesh + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=("PP", "DP", "TP"), + ) + global local_rank + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + from vescale.ndtimeline import init_ndtimers, flush, wait + + init_ndtimers(rank=int(local_rank), local=int(local_rank), enable_streamer=True) + n_hidden = 3 + batches = 8 + model = FourMLP(n_hidden) + all_batches_out = [] + if self.rank == 3: + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(3) + model.cuda(3) + out = model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + print(loss) + print(" ====================================== ") + fwd_plan = { + ".input": [[Replicate()]], + ".output": [[Replicate()]], + } + model_list = [] + + tp_mesh = VESCALE_DEVICE_MESH.get_tensor_parallel_mesh() + if local_rank == 0: + model.mlp1 = parallelize_module(model.mlp1, tp_mesh, {"parameter": None, "forward": fwd_plan}) + model_list = [model.mlp1] + elif self.rank == 1: + model.mlp2 = parallelize_module(model.mlp2, tp_mesh, {"parameter": None, "forward": fwd_plan}) + model_list = [model.mlp2] + elif self.rank == 2: + model.mlp3 = parallelize_module(model.mlp3, tp_mesh, {"parameter": None, "forward": fwd_plan}) + model_list = [model.mlp3] + elif self.rank == 3: + model.mlp4 = parallelize_module(model.mlp4, tp_mesh, {"parameter": None, "forward": fwd_plan}) + model_list = [model.mlp4] + deps = get_linear_pp_module_dep2(model_list, VESCALE_DEVICE_MESH.get_global_tensor_parallel_meshes()) + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append(distribute_tensor(data.float(), tp_mesh, placements=[Replicate()])) + pipe_engine = ScheduleEngine( + deps=deps, + meshes=VESCALE_DEVICE_MESH.get_global_tensor_parallel_meshes(), + schedule=PipelineScheduleType.SIMPLE_1F1B, + batches=batches, + data_iterator=data_iterator, + stage_id=local_rank, + shape=(1, 1, 3), + dtype=torch.float32, + ) + _, all_forward = ScheduleEngine.execute(pipe_engine) + if self.rank == 3: + loss_per_microbatch = [item[1] for item in all_forward] + for t1, t2 in zip(loss_per_microbatch, all_batches_out): + self.assertEqual(t1._local_tensor, t2) + flush() + wait() + + @with_comms + def test_interleaved_1f1b_emmiter(self): + """ + Test schedule instructions generated by ScheduleEngine's pipeline emitter. 
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(self.rank) + n_hidden = 3 + batches = 8 + num_chunks = 2 + meshes = [DeviceMesh(device, [i]) for i in range(self.world_size)] + model = EightMLP(n_hidden) + fwd_plan = { + ".input": [[Replicate()]], + ".output": [[Replicate()]], + } + vpp_module_chunk_list = [] + if self.rank == 0: + model.mlps[0] = parallelize_module(model.mlps[0], meshes[0], {"parameter": None, "forward": fwd_plan}) + model.mlps[4] = parallelize_module(model.mlps[4], meshes[0], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[0], model.mlps[4]] + elif self.rank == 1: + model.mlps[1] = parallelize_module(model.mlps[1], meshes[1], {"parameter": None, "forward": fwd_plan}) + model.mlps[5] = parallelize_module(model.mlps[5], meshes[1], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[1], model.mlps[5]] + elif self.rank == 2: + model.mlps[2] = parallelize_module(model.mlps[2], meshes[2], {"parameter": None, "forward": fwd_plan}) + model.mlps[6] = parallelize_module(model.mlps[6], meshes[2], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[2], model.mlps[6]] + elif self.rank == 3: + model.mlps[3] = parallelize_module(model.mlps[3], meshes[3], {"parameter": None, "forward": fwd_plan}) + model.mlps[7] = parallelize_module(model.mlps[7], meshes[3], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[3], model.mlps[7]] + + deps = get_linear_pp_module_dep2(vpp_module_chunk_list, meshes) + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append( + distribute_tensor( + data.float(), DeviceMesh(device, [self.rank], _validate_mesh=False), placements=[Replicate()] + ) + ) + pipe_engine = ScheduleEngine( + deps, + meshes, + PipelineScheduleType.INTERLEAVED_1F1B, + batches, + iter(data_iterator), + self.rank, + (1, 1, 3), + dtype=torch.float32, + num_chunks=num_chunks, + ) + + @with_comms + def test_runtime_interleaved_1f1b_engine_batch(self): + """ + Test parallelized DModules to perform interleaved 1f1b training. 
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(self.rank) + n_hidden = 3 + batches = 8 + num_chunks = 2 + meshes = [DeviceMesh(device, [i]) for i in range(self.world_size)] + model = EightMLP(n_hidden) + all_batches_out = [] + if self.rank == 3: + true_model = model + for i in range(8): + true_model.mlps[i] = true_model.mlps[i].cuda(3) + true_model.train() + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(3) + out, all_output_x = true_model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + for idx, output in enumerate(all_output_x): + print(f"mlp{idx}.grad is {output.grad}") + print(" ====================================== ") + fwd_plan = { + ".input": [[Replicate()]], + ".output": [[Replicate()]], + } + vpp_module_chunk_list = [] + if self.rank == 0: + model.mlps[0] = parallelize_module(model.mlps[0], meshes[0], {"parameter": None, "forward": fwd_plan}) + model.mlps[4] = parallelize_module(model.mlps[4], meshes[0], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[0], model.mlps[4]] + elif self.rank == 1: + model.mlps[1] = parallelize_module(model.mlps[1], meshes[1], {"parameter": None, "forward": fwd_plan}) + model.mlps[5] = parallelize_module(model.mlps[5], meshes[1], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[1], model.mlps[5]] + elif self.rank == 2: + model.mlps[2] = parallelize_module(model.mlps[2], meshes[2], {"parameter": None, "forward": fwd_plan}) + model.mlps[6] = parallelize_module(model.mlps[6], meshes[2], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[2], model.mlps[6]] + elif self.rank == 3: + model.mlps[3] = parallelize_module(model.mlps[3], meshes[3], {"parameter": None, "forward": fwd_plan}) + model.mlps[7] = parallelize_module(model.mlps[7], meshes[3], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[3], model.mlps[7]] + deps = get_linear_pp_module_dep2(vpp_module_chunk_list, meshes) + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append( + distribute_tensor( + data.float(), DeviceMesh(device, [self.rank], _validate_mesh=False), placements=[Replicate()] + ) + ) + pipe_engine = ScheduleEngine( + deps, + meshes, + PipelineScheduleType.INTERLEAVED_1F1B, + batches, + [iter(data_iterator) for _ in range(num_chunks)], + self.rank, + (1, 1, 3), + dtype=torch.float32, + num_chunks=num_chunks, + loss_fn=self.loss_fn, + ) + if self.rank == 0: + print("schedule", pipe_engine.p_emmiter.instruction_generator.schema) + _, forward_datas = ScheduleEngine.execute(pipe_engine) + if self.rank == 3: + loss_per_microbatch = [item[1] for item in forward_datas] + for t1, t2 in zip(loss_per_microbatch, all_batches_out): + self.assertEqual(t1._local_tensor, t2) + + @with_comms + def test_runtime_interleaved_1f1b_engine_p2p(self): + """ + Test step-by-step initialization of pipeline engine, generation + of simple 1f1b schedule and execution of pipeline engine with + p2p overlapped communication. 
+ """ + device = f"cuda:{self.rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(self.rank) + n_hidden = 3 + batches = 8 + num_chunks = 2 + meshes = [DeviceMesh(device, [i]) for i in range(self.world_size)] + model = EightMLP(n_hidden) + all_batches_out = [] + if self.rank == 3: + true_model = model + for i in range(8): + true_model.mlps[i] = true_model.mlps[i].cuda(3) + true_model.train() + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(3) + out, all_output_x = true_model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + for idx, output in enumerate(all_output_x): + print(f"mlp{idx}.grad is {output.grad}") + print(" ====================================== ") + fwd_plan = { + ".input": [[Replicate()]], + ".output": [[Replicate()]], + } + vpp_module_chunk_list = [] + if self.rank == 0: + model.mlps[0] = parallelize_module(model.mlps[0], meshes[0], {"parameter": None, "forward": fwd_plan}) + model.mlps[4] = parallelize_module(model.mlps[4], meshes[0], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[0], model.mlps[4]] + elif self.rank == 1: + model.mlps[1] = parallelize_module(model.mlps[1], meshes[1], {"parameter": None, "forward": fwd_plan}) + model.mlps[5] = parallelize_module(model.mlps[5], meshes[1], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[1], model.mlps[5]] + elif self.rank == 2: + model.mlps[2] = parallelize_module(model.mlps[2], meshes[2], {"parameter": None, "forward": fwd_plan}) + model.mlps[6] = parallelize_module(model.mlps[6], meshes[2], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[2], model.mlps[6]] + elif self.rank == 3: + model.mlps[3] = parallelize_module(model.mlps[3], meshes[3], {"parameter": None, "forward": fwd_plan}) + model.mlps[7] = parallelize_module(model.mlps[7], meshes[3], {"parameter": None, "forward": fwd_plan}) + vpp_module_chunk_list = [model.mlps[3], model.mlps[7]] + deps = get_linear_pp_module_dep2(vpp_module_chunk_list, meshes) + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append( + distribute_tensor(data.float(), DeviceMesh(device, [0], _validate_mesh=False), placements=[Replicate()]) + ) + pipe_engine = ScheduleEngine( + deps, + meshes, + PipelineScheduleType.INTERLEAVED_1F1B, + batches, + [iter(data_iterator) for _ in range(num_chunks)], + self.rank, + (1, 1, 3), + dtype=torch.float32, + num_chunks=num_chunks, + overlap_p2p_comm=True, + batch_p2p_comm=False, + loss_fn=self.loss_fn, + ) + if self.rank == 0: + print("schedule", pipe_engine.p_emmiter.instruction_generator.schema) + _, forward_datas = ScheduleEngine.execute(pipe_engine) + if self.rank == 3: + loss_per_microbatch = [item[1] for item in forward_datas] + print(loss_per_microbatch, all_batches_out) + for t1, t2 in zip(loss_per_microbatch, all_batches_out): + self.assertEqual(t1._local_tensor, t2) + + @with_comms + def test_zerobubble_engine(self): + """ + Tests zero-bubble pipeline schedule with profiling. 
+ """ + # initialize global device mesh + VESCALE_DEVICE_MESH.init_device_mesh( + device_type="cuda", + mesh_shape=(4, 1, 1), + mesh_dim_names=("PP", "DP", "TP"), + ) + global local_rank + local_rank = self.rank + device = f"cuda:{local_rank}" + # must do this: https://pytorch.org/docs/stable/distributed.html + torch.cuda.set_device(device) + os.environ["LOCAL_RANK"] = str(local_rank) + from vescale.ndtimeline import init_ndtimers, flush, wait + + init_ndtimers(rank=int(local_rank), local_rank=int(local_rank), enable_streamer=True) + num_chunks = 2 + n_hidden = 3 + batches = 8 + model = EightMLP(n_hidden) + for i in range(8): + model.mlps[i] = model.mlps[i].cuda() + all_batches_out = [] + if self.rank == 0: + true_model = model + for i in range(8): + true_model.mlps[i] = true_model.mlps[i].cuda(0) + true_model.train() + for i in range(batches): + print(f" ===========batch: {i}================= ") + data = torch.zeros(1, 1, n_hidden) + i + data = data.float().cuda(0) + out, all_output_x = true_model(data) + loss = out.sum() + all_batches_out.append(loss) + loss.backward(create_graph=True) + for idx, output in enumerate(all_output_x): + print(f"mlp{idx}.grad is {output.grad}") + print(" ====================================== ") + fwd_plan = { + ".input": [[Replicate()]], + ".output": [[Replicate()]], + } + model_list = [] + + if self.rank == 0: + model_list = [model.mlps[0], model.mlps[7]] + elif self.rank == 1: + model_list = [model.mlps[1], model.mlps[6]] + elif self.rank == 2: + model_list = [model.mlps[2], model.mlps[5]] + elif self.rank == 3: + model_list = [model.mlps[3], model.mlps[4]] + deps = get_linear_pp_module_dep2(model_list, VESCALE_DEVICE_MESH.get_global_tensor_parallel_meshes()) + data_iterator = [] + for i in range(batches): + data = torch.zeros(1, 1, n_hidden) + i + data_iterator.append(data.float().cuda()) + + w = n_hidden * 2 * 4 + a = n_hidden * 4 + mem_f = 2 * w + 2 * a # forward weight size + mem_w = -2 * a + mem_b = -mem_w - mem_f + pipe_engine = ScheduleEngine( + deps=deps, + meshes=VESCALE_DEVICE_MESH.get_global_tensor_parallel_meshes(), + schedule=PipelineScheduleType.ZERO_BUBBLE, + batches=batches, + data_iterator=[iter(data_iterator) for _ in range(num_chunks)], + stage_id=local_rank, + shape=(1, 1, 3), + dtype=torch.float32, + f_cost=6, + b_cost=4, + w_cost=4, + c_cost=1, + f_mem=mem_f, + b_mem=mem_b, + w_mem=mem_w, + max_mem=mem_f * 4 * 2, + ) + _, all_forward = ScheduleEngine.execute(pipe_engine) + if self.rank == 0: + loss_per_microbatch = [item[1] for item in all_forward] + print(loss_per_microbatch, all_batches_out) + for t1, t2 in zip(loss_per_microbatch, all_batches_out): + self.assertEqual(t1, t2) + + flush() + wait() + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/instruction/test_userdefine_schedule.py b/test/parallel/pipeline/instruction/test_userdefine_schedule.py new file mode 100644 index 0000000..680a4d9 --- /dev/null +++ b/test/parallel/pipeline/instruction/test_userdefine_schedule.py @@ -0,0 +1,197 @@ +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from common_dtensor import DTensorTestBase, with_comms +from torch.testing._internal.common_utils import run_tests +from vescale.pipe._schedules.instruction_base import ( + register_instruction, + VESCALE_INTRUCTION_BUILDER as builder, + StageDeps, +) +from vescale.initialize.deferred_init import deferred_init +from vescale.pipe import PipeParser +from vescale.pipe.pipe_stage import _generate_stage_dependencies +from vescale.dmodule.api import parallelize_module +from vescale.dtensor.device_mesh import DeviceMesh +import torch +from four_mlp import FourMLP, sharding_plan +from vescale.pipe._schedules.pipedream_flush import maybe_tensor, cross_mesh_send, cross_mesh_recv + +from torch.distributed._functional_collectives import send, recv + +from vescale.plan.pipeline_parallel import PipelineParallelPlan +from vescale.plan.spec import PipelineSplitMethodType + + +class PowerUserScheduleTest(DTensorTestBase): + @with_comms + def test_user_define_schedule(self): + """ + Tests user-defined pipeline schedule. + """ + global_mesh = DeviceMesh("cuda", [[0, 1], [2, 3]]) + torch.cuda.set_device(self.rank) + + @register_instruction(name="send") + def send_forward(): + topo = builder.topo + send_data = builder.last + send_comms = topo.send_tables[builder.stage_id] + send_comm = send_comms[0] + mapping_group = send_comm.cur_mesh.get_mapping_rank(send_comm.peer_mesh) + send(maybe_tensor(send_data), mapping_group, torch.distributed.distributed_c10d._get_default_group()) + cross_mesh_send(send_comm, send_data) + + @register_instruction(name="recv") + def recv_forward(): + topo = builder.topo + recv_comms = topo.recv_tables[builder.stage_id] + recv_comm = recv_comms[0] + recv_tensor = torch.empty((1, 1, 8), requires_grad=True, dtype=torch.float32).cuda() + mapping_group = recv_comm.cur_mesh.get_mapping_rank(recv_comm.peer_mesh) + recv_tensor = recv(recv_tensor, mapping_group, torch.distributed.distributed_c10d._get_default_group()) + recv_dtensor = cross_mesh_recv(recv_comm, recv_tensor) + return recv_dtensor + + @register_instruction(name="forward") + def forward(): + model = builder.model + last_data = builder.last + activation = model(last_data) + return activation + + @register_instruction(name="load_data") + def load_data(): + dataloader = builder.dataloader + pos = builder.pos + data_id = pos // 3 + return dataloader[data_id] + + instruction_list = { + 0: "load_data,forward,send,load_data,forward,send,load_data,forward,send", + 1: "recv,forward,recv,forward,recv,forward", + } + builder.build_from_dict(instruction_list) + builder.draw_instructions() + + deferred_model = deferred_init(FourMLP, hidden=8) + parser = PipeParser() + pipe_config = PipelineParallelPlan( + num_stages=2, + split_method=PipelineSplitMethodType.MANUAL, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(4)], + split_points=["mlp2", "mlp4"], + ) + parser_args = {"shard_plan": sharding_plan} + graph = parser.parse(deferred_model, pipe_config, **parser_args) + root_graph = parser.partition_stage(deferred_model, graph, pipe_config) 
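+ # partition_stage() returns a root graph whose submodules ("stage0", "stage1") correspond + # to the declared split points; each rank below fetches only the module for its own stage.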
+ + if self.rank in [0, 1]: + pipeline_stage_id = 0 + elif self.rank in [2, 3]: + pipeline_stage_id = 1 + + stage_model_pp = root_graph.get_submodule(f"stage{pipeline_stage_id}") + + stage_model_pp_tp = parallelize_module( + stage_model_pp, + global_mesh.get_submesh([1]), + sharding_plan, + factory=False, + ) + + global_tp_meshes = [ + DeviceMesh("cuda", [0, 1], _validate_mesh=False), + DeviceMesh("cuda", [2, 3], _validate_mesh=False), + ] + np_deps, p2p_index_mapping = _generate_stage_dependencies(root_graph, 2, 1) + + deps = StageDeps(np_deps, global_tp_meshes, [stage_model_pp_tp], p2p_index_mapping) + builder.topo = deps + builder.model = stage_model_pp_tp + builder.stage_id = pipeline_stage_id + + data_iterator = [] + if self.rank in [0, 1]: + for i in range(3): + data = torch.zeros((1, 1, 8), dtype=torch.float32) + i + data_iterator.append(data) + builder.dataloader = data_iterator + outputs = builder.run(pipeline_stage_id) + if self.rank in [2, 3]: + print(outputs) + + def _define_instructions(self): + @register_instruction(name="send") + def send_forward(*args, **kwargs): + send_data = args[0] + dst = builder.send_dist + send(maybe_tensor(send_data), dst, torch.distributed.distributed_c10d._get_default_group()) + return (send_data,), {} + + @register_instruction(name="recv") + def recv_forward(*args, **kwargs): + dst = builder.recv_dist + recv_tensor = torch.empty_like(args[0]) + recv_tensor = recv(recv_tensor, dst, torch.distributed.distributed_c10d._get_default_group()) + return (recv_tensor,), {} + + # instruction should be stateless. + @register_instruction(name="forward") + def forward(model, *args, **kwargs): + activation = model(*args, **kwargs) + return (activation,), {} + + instruction_list = { + 0: "forward,send", + 1: "recv,forward", + } + + builder.build_from_dict(instruction_list) + builder.draw_instructions() + + def _parallelize_model(self, global_mesh): + deferred_model = deferred_init(FourMLP, hidden=8) + parser = PipeParser() + pipe_config = PipelineParallelPlan( + num_stages=2, + split_method=PipelineSplitMethodType.MANUAL, + smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(4)], + split_points=["mlp2", "mlp4"], + ) + parser_args = {"shard_plan": sharding_plan} + graph = parser.parse(deferred_model, **parser_args) + root_graph = parser.partition_stage(deferred_model, graph, pipe_config) + + if self.rank in [0, 1]: + pipeline_stage_id = 0 + elif self.rank in [2, 3]: + pipeline_stage_id = 1 + + stage_model_pp = root_graph.get_submodule(f"stage{pipeline_stage_id}") + + tp_submesh = global_mesh.get_submesh([1]) + stage_model_pp_tp = parallelize_module( + stage_model_pp, + tp_submesh, + sharding_plan, + factory=False, + ) + + return stage_model_pp_tp, root_graph + + +if __name__ == "__main__": + run_tests() diff --git a/test/parallel/pipeline/instruction/test_zerobubble.py b/test/parallel/pipeline/instruction/test_zerobubble.py new file mode 100644 index 0000000..3d1f6a3 --- /dev/null +++ b/test/parallel/pipeline/instruction/test_zerobubble.py @@ -0,0 +1,103 @@ +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import unittest +from four_mlp import FourMLP +import torch +import torch.optim as optim + + +class ZeroBubbleTest(unittest.TestCase): + def test_split_backward(self): + """ + Tests how to separately compute activation gradient and parameter gradient + in zero bubble pipeline schedule. + """ + model = FourMLP(hidden=8) + + stage0 = model.mlp1 + stage1 = model.mlp2 + + input = torch.randn(8, 8, requires_grad=True) + + stage0_out = stage0(input) + stage1_out = stage1(stage0_out) + loss = stage1_out.sum() + optimizer = optim.SGD(model.parameters(), lr=0.01) + + # calc zbv grad + optimizer.zero_grad() + + # calc activation grad (B) + activation_grad_output = torch.autograd.grad(loss, stage1_out, retain_graph=True) + activation_grad_stage1 = torch.autograd.grad( + stage1_out, + stage0_out, + grad_outputs=activation_grad_output, + retain_graph=True, + allow_unused=True, + materialize_grads=True, + ) + activation_grad_stage0 = torch.autograd.grad( + stage0_out, + input, + grad_outputs=activation_grad_stage1, + retain_graph=True, + allow_unused=True, + materialize_grads=True, + ) + + # calc params grad (W) + nps1 = {} + for key, value in stage1.named_parameters(): + nps1[key] = value + + nps0 = {} + for key, value in stage0.named_parameters(): + nps0[key] = value + + parameters_grad_stage1 = torch.autograd.grad( + stage1_out, + nps1.values(), + grad_outputs=activation_grad_output, + retain_graph=True, + allow_unused=True, + materialize_grads=True, + ) + parameters_grad_stage0 = torch.autograd.grad( + stage0_out, + nps0.values(), + grad_outputs=activation_grad_stage1, + retain_graph=True, + allow_unused=True, + materialize_grads=True, + ) + + # calc normal grad + optimizer.zero_grad() + loss.backward() + + # validate grads are same + print("fc1.weight.grad", stage1.fc1.weight.grad) + print("fc2.weight.grad", stage1.fc2.weight.grad) + + torch.testing.assert_close(stage1.fc1.weight.grad, parameters_grad_stage1[0]) + torch.testing.assert_close(stage1.fc2.weight.grad, parameters_grad_stage1[1]) + torch.testing.assert_close(stage0.fc1.weight.grad, parameters_grad_stage0[0]) + torch.testing.assert_close(stage0.fc2.weight.grad, parameters_grad_stage0[1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/vescale/devicemesh_api/__init__.py b/vescale/devicemesh_api/__init__.py index a9c8ea7..ed86ee4 100644 --- a/vescale/devicemesh_api/__init__.py +++ b/vescale/devicemesh_api/__init__.py @@ -15,4 +15,4 @@ # ################################################################################ -from .api import VESCALE_DEVICE_MESH +from .api import VESCALE_DEVICE_MESH, VeDeviceMesh diff --git a/vescale/dtensor/__init__.py b/vescale/dtensor/__init__.py index 2c2763d..2899061 100644 --- a/vescale/dtensor/__init__.py +++ b/vescale/dtensor/__init__.py @@ -25,7 +25,7 @@ ) from vescale.dtensor.device_mesh import DeviceMesh, mesh_resources from vescale.dtensor.api import normalize_placements -from vescale.dtensor.dtensor import DTensor +from vescale.dtensor.dtensor import DTensor, make_dtensor from 
vescale.dtensor.ops.utils import normalize_to_torch_size from vescale.dtensor.placement_types import DTensorSpec, Placement, Replicate, TensorMeta diff --git a/vescale/dtensor/_diff.py b/vescale/dtensor/_diff.py index fed72c5..d60fd0d 100644 --- a/vescale/dtensor/_diff.py +++ b/vescale/dtensor/_diff.py @@ -21,8 +21,9 @@ import logging - VESCALE_DISABLE_REDISTRIBUTE = os.environ.get("VESCALE_DISABLE_REDISTRIBUTE", "1") == "1" +VESCALE_DUMMY_P2P = os.environ.get("VESCALE_DUMMY_P2P", "0") == "1" +VESCALE_DUMP_INSTRUCTION = os.environ.get("VESCALE_DUMP_INSTRUCTION", "0") == "1" global VESCALE_SHARDING_SUGGETSION VESCALE_SHARDING_SUGGETSION = [] diff --git a/vescale/dtensor/dtensor.py b/vescale/dtensor/dtensor.py index ef2aeeb..719d0b9 100644 --- a/vescale/dtensor/dtensor.py +++ b/vescale/dtensor/dtensor.py @@ -493,15 +493,15 @@ def to_local( grad_placements: Optional[Sequence[Placement]] = None, async_output: bool = True, ) -> torch.Tensor: - # NOTE: moving impl code here for performance, as here is on the critial path but api function is NEVER used if grad_placements is not None: grad_placements: Tuple[Placement] = normalize_placements( grad_placements, self._spec.mesh.ndim, tensor_ndim=self.ndim ) - + return _ToTorchTensor.apply(self, grad_placements, async_output) + def redistribute( self, device_mesh: Optional[DeviceMesh] = None, @@ -539,3 +539,24 @@ def tolist(self) -> Union[List, Number]: - This operation is not dispatched but a torch function. """ return self._local_tensor.tolist() + + +def make_dtensor( + local_tensor: torch.Tensor, + device_mesh: DeviceMesh, + placements: Tuple[Placement, ...], + *, + shape: torch.Size, + dtype: torch.dtype, + requires_grad: bool, + stride: Tuple[int, ...], +): + return DTensor( + local_tensor, + device_mesh, + placements, + shape=shape, + dtype=dtype, + requires_grad=requires_grad, + stride=stride, + ) diff --git a/vescale/dtensor/placement_types.py b/vescale/dtensor/placement_types.py index 5332102..bb4f474 100644 --- a/vescale/dtensor/placement_types.py +++ b/vescale/dtensor/placement_types.py @@ -56,9 +56,9 @@ def serialize_from_tensor(tensor: torch.Tensor): elif tensor[0] == 1: return Partial() elif tensor[0] == 2: - return Shard(dim=tensor[1]) + return Shard(dim=tensor[1].item()) elif tensor[0] == 3: - return InterleavedShard(dim=tensor[1], interleaved_size=tensor[2]) + return InterleavedShard(dim=tensor[1].item(), interleaved_size=tensor[2].item()) class Shard(Placement): diff --git a/vescale/engine/__init__.py b/vescale/engine/__init__.py new file mode 100644 index 0000000..78249da --- /dev/null +++ b/vescale/engine/__init__.py @@ -0,0 +1,18 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +from .pipe import PipeEngine diff --git a/vescale/engine/pipe.py b/vescale/engine/pipe.py new file mode 100644 index 0000000..4cd66c8 --- /dev/null +++ b/vescale/engine/pipe.py @@ -0,0 +1,237 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from collections import defaultdict +from typing import Any, List, Callable +from vescale.pipe.pipe_stage import PipeModule +from vescale.plan.pipeline_parallel import PipelineParallelPlan +from vescale.pipe.pipe_emmiter import ScheduleEngine, StageDeps +from vescale.devicemesh_api import VeDeviceMesh +from vescale.plan.spec import PipelineScheduleType +from vescale.ddp.distributed_data_parallel import DistributedDataParallel as DDP +from copy import deepcopy +import torch +import torch.nn as nn +import torch.distributed as dist +import os + + +class PipeEngine: + def __init__( + self, + module: PipeModule, + global_mesh: VeDeviceMesh, + loss_fn: Callable, + config: PipelineParallelPlan, + ): + """ + Training engine for pipeline parallelism and for the multi-dimensional + parallelism that underlies it (distributed optimizer, data parallel, + tensor model parallel, sequence parallel, etc.). + The training engine is responsible for materializing stage partitioning, registering + modules, training, and synchronizing optimizer state. + """ + self.module = module + self.virtual_chunks_per_stage = config.virtual_chunks + self.engine_config = config + self.optimizer = self.module.get_optimizer + self.lr_scheduler = self.module.get_lr_scheduler + self.global_mesh = global_mesh + if isinstance(loss_fn, nn.Module): + self.loss_fn = loss_fn + else: + try: + self.loss_fn = deepcopy(loss_fn.__func__) + except: # noqa: E722 + self.loss_fn = loss_fn + self.schedule_engine = None + self.reuse_comm_shape = self.engine_config.reuse_p2p_tensor_shape + if self.reuse_comm_shape: + os.environ["REUSE_COMM_SHAPE"] = "1" + if ( + self.engine_config.schedule_type == PipelineScheduleType.INTERLEAVED_1F1B + and self.virtual_chunks_per_stage == 1 + ): + print("[warning]: #virtual pipeline chunks is 1. Falling back to simple 1F1B schedule.") + self.engine_config.schedule_type = PipelineScheduleType.SIMPLE_1F1B + self.schedule_type = self.engine_config.schedule_type + + def build_schedule(self, minibatches, data_shape=None): + """ + Build pipeline parallel training schedules.
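+ Concretely: resolve this rank's tensor-parallel submeshes, wrap the stage dependency + matrix in StageDeps, align the minibatch count with the first pipeline stage, and lay + out the data iterator per schedule type (one iterator per virtual chunk for + interleaved/zero-bubble schedules, the raw minibatch list for simple 1F1B).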
+ """ + meshes = self.global_mesh.get_global_tensor_parallel_meshes() + dp_rank, tp_rank = self.global_mesh.get_data_parallel_rank(), self.global_mesh.get_tensor_parallel_rank() + tp_meshes_dict = defaultdict(list) + + def _locate_tp_mesh(_rank): + for tp_mesh in meshes: + if _rank in tp_mesh.mesh.tolist(): + return tp_mesh + else: + raise ValueError("TP submesh not found.") + + for _rank in range(torch.distributed.get_world_size()): + _coordinate = self.global_mesh.get_strategy_coordinate(_rank) + tp_mesh = _locate_tp_mesh(_rank) + _dp_rank, _tp_rank = _coordinate[1], _coordinate[2] + tp_meshes_dict[(_dp_rank, _tp_rank)].append(tp_mesh) + + new_meshes = tp_meshes_dict[(dp_rank, tp_rank)] + meshes = new_meshes + first_stage_rank = self.global_mesh.get_strategy_coordinate(local_rank=0)[0] + # FIXME: the input can either be PipeModule, or a sequence of DDP modules? In the latter case, how to get stage dependency + pipe_module = self.module + stage_dep_matrix, p2p_index_mapping = pipe_module.stage_deps, pipe_module.p2p_index_mapping + stage_dependency = StageDeps( + dep=stage_dep_matrix, + meshes=meshes, + vpp_module_list=pipe_module, + p2p_index_mapping=p2p_index_mapping, + ) + num_minibatches = self._align_num_batches(first_stage_rank, len(minibatches)) + # TODO: insert shape inference + batch_p2p_comm = self.engine_config.batch_p2p_comm + # if on interleaved 1f1b schedule, set batch_p2p_comm to False to execute p2p communication + schedule_type = self.schedule_type + if schedule_type in [PipelineScheduleType.INTERLEAVED_1F1B, PipelineScheduleType.ZERO_BUBBLE]: + data_iterator = [iter(minibatches) for _ in range(self.virtual_chunks_per_stage)] + batch_p2p_comm = False + elif schedule_type == PipelineScheduleType.SIMPLE_1F1B: + data_iterator = minibatches + else: + raise NotImplementedError(f"Schedule {schedule_type} not implemented yet.") + return ScheduleEngine( + stage_dependency, + meshes, + schedule_type, + num_minibatches, + data_iterator=data_iterator, + stage_id=self.global_mesh.get_pipeline_parallel_rank(), + shape=data_shape, + dtype=self.engine_config.p2p_tensor_dtype, + num_chunks=self.virtual_chunks_per_stage, + input_shapes=None, + input_shapes_unpad=None, + # send_dtypes_map=self.module.recv_dtypes_dict, + overlap_p2p_comm=self.engine_config.overlap_p2p_comm, + batch_p2p_comm=batch_p2p_comm, + loss_fn=self.loss_fn, + global_mesh=self.global_mesh, + forward_only=self.engine_config.forward_only, + ) + + def forward_backward( + self, + minibatch, + reuse_schedule=False, + data_shape=None, + debug_mode: bool = False, + ): + """ + Execute the pipeline schedule to complete forward, + backward, and gradient step of one minibatch. + + Invoke Scheduler's execute_pipeline() to run a minibatch. 
+ """ + assert isinstance(minibatch, List), "Input must be a list of microbatches" + if reuse_schedule: + if self.schedule_engine is None: + schedule_engine = self.build_schedule(minibatch, data_shape=data_shape) + else: + schedule_engine = self.schedule_engine + schedule_engine.set_data_iterator(minibatch, data_shape=data_shape) + else: + schedule_engine = self.build_schedule(minibatch, data_shape=data_shape) + # returns model output tensors and losses per microbatch + return ScheduleEngine.execute(schedule_engine, debug_mode=debug_mode) + + def forward(self, *args: Any, **kwargs: Any): + raise ValueError("Forward is done in PipeEngine.forward_backward()!") + + def __call__(self, *args: Any, **kwargs: Any): + return self.forward_backward(*args, **kwargs) + + def backward(self, *args: Any, **kwargs: Any): + raise ValueError("Backward is done in PipeEngine.forward_backward()!") + + @property + def get_optimizer(self): + """ + Return this stage's optimizer. + """ + return self.optimizer + + @property + def get_lr_scheduler(self): + return self.lr_scheduler + + def zero_grad_buffer(self, zero_buffer: bool = True): + for vpp_module in self.module.stage_modules.values(): + if isinstance(vpp_module, DDP): + vpp_module.zero_grad_buffer(zero_buffer) + + def finish_grad_sync(self): + for vpp_module in self.module.stage_modules.values(): + if isinstance(vpp_module, DDP): + vpp_module.finish_grad_sync() + + def train(self, mode: bool = True): + for vpp_module in self.module.stage_modules.values(): + vpp_module.train(mode) + + def eval(self): + for vpp_module in self.module.stage_modules.values(): + vpp_module.eval() + + def parameters(self, including_frozen=False): + """ + Return meta information of the entire model's + parameters. + """ + if including_frozen: + return self.module.parameters() + else: + return filter(lambda p: p.requires_grad, self.module.parameters()) + + def sync_shared_params(self, group_id: int = 0, share_params=True) -> None: + """ + Synchronize gradients and weights among groups of specified units, dictated by + "partition_units" in PipelineConfig. Typically, this function is used for + synchronizing gradients and weights of embeddings layers in Transformer-based + architecture. + Args: + group_id (int): specify groups of modules across stages to synchronize. Default by 0. + share_params (bool): if True, sync weight parameters; otherwise, share gradients. + """ + local_rank = dist.distributed_c10d.get_rank() + tp_coordinate = self.module.device_mesh_management.get_tensor_parallel_rank() + if self.module.shared_module_mapping and local_rank in dist.distributed_c10d.get_process_group_ranks( + self.module.shared_module_process_groups[group_id][tp_coordinate] + ): + self.module.sync_shared_params(self.global_mesh, group_id=group_id, share_params=share_params) + + def _align_num_batches(self, first_stage_rank, batches): + """ + Aligns all ranks must have the same number of mini-batches as rank 0. 
+ """ + num_batches = torch.tensor([batches], dtype=torch.int64).cuda(dist.get_rank()) + dist.broadcast(num_batches, src=first_stage_rank) + is_consistent = num_batches.item() == batches + if not is_consistent: + batches = num_batches.item() + return batches diff --git a/vescale/initialize/__init__.py b/vescale/initialize/__init__.py index 8d130f9..e410817 100644 --- a/vescale/initialize/__init__.py +++ b/vescale/initialize/__init__.py @@ -15,4 +15,4 @@ # ################################################################################ -from .deferred_init import deferred_init, is_deferred, materialize_dtensor, materialize_dparameter +from .deferred_init import deferred_init, is_deferred, materialize_dtensor, materialize_dparameter, materialize_module diff --git a/vescale/initialize/deferred_init.py b/vescale/initialize/deferred_init.py index fdab336..5498ca8 100644 --- a/vescale/initialize/deferred_init.py +++ b/vescale/initialize/deferred_init.py @@ -18,6 +18,7 @@ from torchdistx.deferred_init import deferred_init as _deferred_init from torchdistx.deferred_init import is_deferred as _is_deferred from torchdistx.deferred_init import _C + from torchdistx.deferred_init import materialize_module as _materialize_module IMPORT_DEFER = True except: # noqa: E722 @@ -81,6 +82,19 @@ def is_deferred(obj: Union[torch.Tensor, nn.Parameter, nn.Module]) -> bool: return _is_deferred(obj) +def materialize_module(obj: nn.Module): + """Materializes deferred initialized ``nn.Module`` object. + + Args: + obj: + An ``nn.Module`` instance. + """ + if not IMPORT_DEFER: + return False + + _materialize_module(obj) + + def materialize_dtensor( tensor: torch.Tensor, device_mesh: Optional[DeviceMesh] = None, diff --git a/vescale/model/base_gpt/__init__.py b/vescale/model/base_gpt/__init__.py new file mode 100644 index 0000000..f3b869e --- /dev/null +++ b/vescale/model/base_gpt/__init__.py @@ -0,0 +1,5 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ diff --git a/vescale/model/base_gpt/attention.py b/vescale/model/base_gpt/attention.py new file mode 100644 index 0000000..66c615d --- /dev/null +++ b/vescale/model/base_gpt/attention.py @@ -0,0 +1,531 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. 
+################################################################################
+
+import math
+
+import torch
+from torch import nn
+
+from vescale.dtensor.api import from_local
+from vescale.model.base_gpt.checkpoint import checkpoint
+from vescale.model.base_gpt.enums import AttnMaskType, AttnType
+from vescale.model.base_gpt.fuse_softmax import FusedScaleMaskSoftmax
+from vescale.model.base_gpt.rotary import apply_rotary_pos_emb
+from vescale.model.random import get_cuda_rng_tracker
+from vescale.model.utils import attention_mask_func, divide
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_unpadded_func
+except ImportError:
+    try:
+        from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
+    except ImportError:
+        flash_attn_unpadded_func = None
+
+try:
+    from einops import rearrange
+except ImportError:
+    rearrange = None
+
+
+class CoreAttention(nn.Module):
+    def __init__(self, layer_number, config, attn_mask_type=AttnMaskType.padding):
+        super().__init__()
+        self.fp16 = config.fp16
+        self.bf16 = config.bf16
+
+        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+        self.attn_mask_type = attn_mask_type
+        self.sequence_parallel = config.sequence_parallel
+
+        self.config = config
+
+        # Per attention head and per partition values.
+
+        coeff = None
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+
+        self.scale_mask_softmax = FusedScaleMaskSoftmax(
+            self.fp16,
+            self.bf16,
+            self.attn_mask_type,
+            config.masked_softmax_fusion,
+            attention_mask_func,
+            self.attention_softmax_in_fp32,
+            coeff,
+        )
+
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs for different numbers of parallel partitions, but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        # ===================================
+        # Raw attention scores. [b, np, s, s]
+        # ===================================
+
+        # [b, np, sq, sk]
+        output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+        # [sq, b, np, hn] -> [sq, b * np, hn]
+        query_layer = query_layer.reshape(output_size[2], output_size[0] * output_size[1], -1)
+        # [sk, b, np, hn] -> [sk, b * np, hn]
+        key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+        q_t = query_layer.transpose(0, 1)
+        # preallocating input tensor: [b * np, sq, sk]
+        matmul_input_buffer = torch.empty(
+            (output_size[0] * output_size[1] // query_layer._spec.mesh.size(), output_size[2], output_size[3]),
+            dtype=query_layer.dtype,
+            device=query_layer.device,
+        )
+        matmul_input_buffer = from_local(matmul_input_buffer, query_layer._spec.mesh, q_t._spec.placements)
+
+        # Raw attention scores. 
[b * np, sq, sk] + projection_size = self.config.kv_channels * self.config.num_attention_heads + hidden_size_per_attention_head = divide(projection_size, self.config.num_attention_heads) + norm_factor = math.sqrt(hidden_size_per_attention_head) + norm_factor *= self.layer_number + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / norm_factor), + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + if not self.sequence_parallel: + with get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + attention_probs = from_local(attention_probs, attention_scores._spec.mesh, attention_scores._spec.placements) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + context_layer = context_layer.view(*context_layer.size()[:-2], -1) + + return context_layer + + +class FlashSelfAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): + super().__init__() + assert flash_attn_unpadded_func is not None, ( + "Please install FlashAttention first, " "e.g., with pip install flash-attn" + ) + assert rearrange is not None, "Please install einops first, e.g., with pip install einops" + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + + assert all(i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v)) + assert all(i.is_cuda for i in (q, k, v)) + + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = k.shape[1] + + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q.device) + + if self.training: + # during training q,k,v always have same seqlen + assert seqlen_k == seqlen_q + + is_causal = self.causal + cu_seqlens_k = cu_seqlens_q + dropout_p = self.dropout_p + else: + # turn off FA causal mask after first inference autoregressive iteration + # only on first autoregressive step q,k,v have same seqlen + is_causal = seqlen_q == seqlen_k + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=q.device + ) + dropout_p = 0 + + output = flash_attn_unpadded_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + seqlen_q, + seqlen_k, + dropout_p, + softmax_scale=self.softmax_scale, + causal=is_causal, + ) + + output = rearrange(output, "(b s) ... -> b s ...", b=batch_size) + return output + + +class ParallelAttention(nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + config, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.config = config + + self.group_query_attention = config.group_query_attention + self.num_query_groups = config.num_query_groups + + query_projection_size = config.kv_channels * config.num_attention_heads + if self.group_query_attention: + kv_projection_size = config.kv_channels * config.num_query_groups + else: + kv_projection_size = config.kv_channels * config.num_attention_heads + + self.use_flash_attn = ( + config.use_flash_attn + and attention_type == AttnType.self_attn + and self.attn_mask_type == AttnMaskType.causal + ) + if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError("FlashAttention is not installed, please install with " "pip install flash-attn") + assert attention_type == AttnType.self_attn, ( + "FlashAttention code path only supports " "self-attention for now" + ) + assert self.attn_mask_type == AttnMaskType.causal, ( + "FlashAttention code path only " "supports causal mask for now" + ) + if rearrange is None: + raise ImportError("einops is not installed, please install with pip install einops") + + # Strided linear layer. 
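+        # With grouped-query attention, ng query groups share K/V heads, so the
+        # fused QKV projection below maps h -> query_projection_size + 2 * kv_projection_size,
+        # i.e. (np + 2 * ng) * kv_channels per partition before TP sharding.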
+        if attention_type == AttnType.self_attn:
+            self.query_key_value = nn.Linear(
+                config.hidden_size,
+                query_projection_size + 2 * kv_projection_size,
+                bias=config.add_bias_linear,
+            )
+            config.init_method(self.query_key_value.weight)
+        else:
+            assert attention_type == AttnType.cross_attn
+
+            if self.group_query_attention:
+                raise NotImplementedError("Grouped query attention not implemented for cross-attention.")
+            assert query_projection_size == kv_projection_size
+
+            self.query = nn.Linear(
+                config.hidden_size,
+                query_projection_size,
+                bias=config.add_bias_linear,
+            )
+            config.init_method(self.query.weight)
+            self.key_value = nn.Linear(config.hidden_size, 2 * kv_projection_size, bias=config.add_bias_linear)
+            config.init_method(self.key_value.weight)
+
+        self.core_attention = CoreAttention(self.layer_number, config, self.attn_mask_type)
+        self.checkpoint_core_attention = config.recompute_granularity == "selective"
+
+        if self.use_flash_attn:
+            self.core_attention_flash = FlashSelfAttention(causal=True, attention_dropout=config.attention_dropout)
+
+        # Output.
+        self.dense = nn.Linear(
+            query_projection_size,
+            config.hidden_size,
+            bias=False,
+        )
+        self.dense_bias = torch.empty(config.hidden_size) if config.add_bias_linear else None
+        config.output_layer_init_method(self.dense.weight)
+
+    def _checkpointed_attention_forward(self, query_layer, key_layer, value_layer, attention_mask, rotary_pos_emb=None):
+        """Forward method with activation checkpointing."""
+
+        def custom_forward(*inputs):
+            query_layer = inputs[0]
+            key_layer = inputs[1]
+            value_layer = inputs[2]
+            attention_mask = inputs[3]
+            output_ = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
+            return output_
+
+        q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None else rotary_pos_emb
+
+        hidden_states = checkpoint(
+            custom_forward, False, query_layer, key_layer, value_layer, attention_mask, q_pos_emb, k_pos_emb
+        )
+
+        return hidden_states
+
+    def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention_heads):
+        query_projection_size = self.config.kv_channels * self.config.num_attention_heads
+        hidden_size_per_attention_head = divide(query_projection_size, self.config.num_attention_heads)
+        return torch.empty(
+            inference_max_sequence_len,
+            batch_size,
+            num_attention_heads,
+            hidden_size_per_attention_head,
+            dtype=self.params_dtype,  # assumes params_dtype is provided elsewhere (e.g., from config)
+            device=torch.cuda.current_device(),
+        )
+
+    def forward(
+        self, hidden_states, attention_mask=None, encoder_output=None, inference_params=None, rotary_pos_emb=None
+    ):
+        # hidden_states: [sq, b, h]
+
+        # Per attention head and per partition values.
+        world_size = hidden_states._spec.mesh.size()  # TP size
+        query_projection_size = self.config.kv_channels * self.config.num_attention_heads
+        self.hidden_size_per_attention_head = divide(query_projection_size, self.config.num_attention_heads)
+        self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
+
+        if self.group_query_attention:
+            self.num_query_groups_per_partition = divide(self.num_query_groups, world_size)
+        else:
+            self.num_query_groups_per_partition = self.num_attention_heads_per_partition
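+
+        # During autoregressive inference, past K/V projections are cached in
+        # pre-allocated per-layer buffers (see _allocate_memory) so each new
+        # token only has to compute its own K/V entries.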
+
+        # =================================================
+        # Pre-allocate memory for key-values for inference.
+        # =================================================
+        is_first_step = False
+        if inference_params:
+            if self.layer_number not in inference_params.key_value_memory_dict:
+                inf_max_seq_len = inference_params.max_sequence_length
+                inf_max_batch_size = inference_params.max_batch_size
+                inference_key_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size, self.num_query_groups_per_partition
+                )
+                inference_value_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size, self.num_query_groups_per_partition
+                )
+
+                inference_params.key_value_memory_dict[self.layer_number] = (
+                    inference_key_memory,
+                    inference_value_memory,
+                )
+                is_first_step = True
+            else:
+                inference_key_memory, inference_value_memory = inference_params.key_value_memory_dict[self.layer_number]
+
+        # =====================
+        # Query, Key, and Value
+        # =====================
+        if self.attention_type == AttnType.self_attn:
+            # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn]
+            mixed_x_layer = self.query_key_value(hidden_states)
+
+            # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
+            new_tensor_shape = mixed_x_layer.size()[:-1] + (
+                self.num_query_groups_per_partition,
+                (
+                    (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
+                    * self.hidden_size_per_attention_head
+                ),
+            )
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+
+            # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
+            (query_layer, key_layer, value_layer) = torch.split(
+                mixed_x_layer,
+                [
+                    (
+                        self.num_attention_heads_per_partition
+                        // self.num_query_groups_per_partition
+                        * self.hidden_size_per_attention_head
+                    ),
+                    self.hidden_size_per_attention_head,
+                    self.hidden_size_per_attention_head,
+                ],
+                dim=3,
+            )
+
+            # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
+            query_layer = query_layer.view(
+                query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head
+            )
+        else:
+            # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
+            mixed_kv_layer = self.key_value(encoder_output)
+
+            # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
+            new_tensor_shape = mixed_kv_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                2 * self.hidden_size_per_attention_head,
+            )
+            mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
+
+            # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
+            # NOTE: split_tensor_along_last_dim is expected to come from a
+            # tensor-parallel utility module.
+            (key_layer, value_layer) = split_tensor_along_last_dim(mixed_kv_layer, 2)
+
+            # Attention head [sq, b, h] --> [sq, b, hp]
+            query_layer = self.query(hidden_states)
+            # [sq, b, hp] --> [sq, b, np, hn]
+            new_tensor_shape = query_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            )
+            query_layer = query_layer.view(*new_tensor_shape)
+
+        # ==================================
+        # Adjust key and value for inference
+        # ==================================
+
+        # duplicate the pos_emb for self attention
+        if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple):
+            rotary_pos_emb = (rotary_pos_emb,) * 2
+
+        if inference_params:
+            batch_start = inference_params.batch_size_offset
+            batch_end = batch_start + key_layer.size(1)
+            assert batch_end <= inference_key_memory.size(1)
+            sequence_start = inference_params.sequence_len_offset
+            sequence_end = sequence_start + key_layer.size(0)
+            assert sequence_end <= inference_key_memory.size(0)
+            # Copy key and values.
+            inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] 
= key_layer + inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[:sequence_end, batch_start:batch_end, ...] + value_layer = inference_value_memory[:sequence_end, batch_start:batch_end, ...] + + # adjust the key rotary positional embedding + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + # need to cross check this condition during inference + # if not set_inference_key_value_memory: + if not is_first_step: + # In inference, we compute one token at a time. + # Select the correct positional embedding + # (only the last token in the sequence) + q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] + else: + # In the first forward pass of inference, + # we use the entire provided prefix. + # q_pos_emb here has the rope embeddings of the entire + # prefix + to-be-generated output so + # we slice to just the prefix. + q_pos_emb = q_pos_emb[:sequence_end, :, :, :] + k_pos_emb = k_pos_emb[:sequence_end, :, :, :] + rotary_pos_emb = (q_pos_emb, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn] + if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key_layer = key_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + value_layer = value_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb) + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + if not self.use_flash_attn: + if self.checkpoint_core_attention: + context_layer = self._checkpointed_attention_forward( + query_layer, key_layer, value_layer, attention_mask + ) + else: + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + else: + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (query_layer, key_layer, value_layer)) + if not self.sequence_parallel: + with get_cuda_rng_tracker().fork(): + context_layer = self.core_attention_flash(q, k, v) + else: + context_layer = self.core_attention_flash(q, k, v) + context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous() + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, self.dense_bias diff --git a/vescale/model/base_gpt/checkpoint.py b/vescale/model/base_gpt/checkpoint.py new file mode 100644 index 0000000..d6f6cd7 --- /dev/null +++ b/vescale/model/base_gpt/checkpoint.py @@ -0,0 +1,133 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. 
+################################################################################ + +from typing import Any, Tuple + +import torch + +from vescale.dtensor.dtensor import DTensor +from vescale.dtensor.placement_types import Replicate +from vescale.model.random import _set_cuda_rng_state, get_cuda_rng_tracker + + +def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: + if isinstance(inputs, tuple): + out = [] + for inp in inputs: + if not isinstance(inp, torch.Tensor): + out.append(inp) + continue + + x = inp.detach() + x.requires_grad = inp.requires_grad + out.append(x) + return tuple(out) + else: + raise RuntimeError( + "Only tuple of tensors is supported. Got Unsupported input type: ", + type(inputs).__name__, + ) + + +def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): + """Break a tensor into equal 1D chunks across tensor parallel ranks. + + Returns a Tensor or View with this rank's portion of the data. + + Arguments: + tensor: The tensor to split + + Keyword Arguments: + new_buffer (bool): If True, returns a new Tensor. + If False, returns a view into the existing Tensor. + Default is False + + """ + device_mesh = tensor.device_mesh + partition_size = torch.numel(tensor) // device_mesh.size() + start_index = partition_size * device_mesh.get_rank() + end_index = start_index + partition_size + if new_buffer: + data = torch.empty(partition_size, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False) + data.copy_(tensor.view(-1)[start_index:end_index]) + else: + data = tensor.view(-1)[start_index:end_index] + return data + + +class CheckpointFunction(torch.autograd.Function): + """This function is adapted from torch.utils.checkpoint with + two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + """ + + @staticmethod + def forward(ctx, run_function, distribute_saved_activations, *args): + ctx.run_function = run_function + ctx.distribute_saved_activations = distribute_saved_activations + + # Copy the rng states. + ctx.fwd_cpu_rng_state = torch.get_rng_state() + ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + with torch.no_grad(): + outputs = run_function(*args) + + # Divide hidden states across model parallel group and only keep + # the chunk corresponding to the current rank. + if distribute_saved_activations: + ctx.input_0_shape = args[0].data.shape + assert isinstance(args[0].data, DTensor) + args[0].data = split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True) + + # Store everything. + ctx.save_for_backward(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") + inputs = ctx.saved_tensors + if ctx.distribute_saved_activations: + assert isinstance(inputs[0].data, DTensor) + inputs[0].data.redistribute(inputs[0].data.device_mesh, [Replicate()]) + + # Store the current states. + bwd_cpu_rng_state = torch.get_rng_state() + bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() + + # Set the states to what it used to be before the forward pass. 
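+        # The recomputed forward below must observe exactly the RNG states of
+        # the original forward pass so dropout masks are reproduced bit-for-bit;
+        # the backward-time states saved above are restored afterwards.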
+ torch.set_rng_state(ctx.fwd_cpu_rng_state) + _set_cuda_rng_state(ctx.fwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) + + # Compute the forward pass. + detached_inputs = detach_variable(inputs) + with torch.enable_grad(): + outputs = ctx.run_function(*detached_inputs) + + # Set the states back to what it was at the start of this function. + torch.set_rng_state(bwd_cpu_rng_state) + _set_cuda_rng_state(bwd_cuda_rng_state) + get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) + + if isinstance(outputs, DTensor): + outputs = (outputs,) + torch.autograd.backward(outputs, args) + grads = tuple(inp.grad if isinstance(inp, DTensor) else inp for inp in detached_inputs) + return (None, None) + grads + + +def checkpoint(function, distribute_saved_activations, *args): + """Checkpoint a model or part of the model. + This has been directly copied from torch.utils.checkpoint.""" + return CheckpointFunction.apply(function, distribute_saved_activations, *args) diff --git a/vescale/model/base_gpt/enums.py b/vescale/model/base_gpt/enums.py new file mode 100644 index 0000000..841dffd --- /dev/null +++ b/vescale/model/base_gpt/enums.py @@ -0,0 +1,27 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ + +import enum + + +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 diff --git a/vescale/model/base_gpt/fuse_layer_norm.py b/vescale/model/base_gpt/fuse_layer_norm.py new file mode 100644 index 0000000..e1e5801 --- /dev/null +++ b/vescale/model/base_gpt/fuse_layer_norm.py @@ -0,0 +1,119 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. 
+################################################################################
+
+"""This code is copied from NVIDIA apex:
+    https://github.com/NVIDIA/apex
+with some changes."""
+
+import importlib
+import numbers
+
+import torch
+from torch import nn
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+
+fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+
+try:
+    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
+
+    HAVE_PERSIST_LAYER_NORM = True
+except ImportError:
+    FastLayerNormFN = None
+    HAVE_PERSIST_LAYER_NORM = False
+
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
+
+    HAVE_FUSED_LAYER_NORM = True
+except ImportError:
+    FusedLayerNormAffineFunction = None
+    HAVE_FUSED_LAYER_NORM = False
+
+
+class MixedFusedLayerNorm(nn.Module):
+    def __init__(
+        self,
+        normalized_shape,
+        eps=1e-5,
+        no_persist_layer_norm=True,
+        param_dtype=torch.float32,
+        sequence_parallel=False,
+        apply_layernorm_1p=False,
+    ):
+        super().__init__()
+
+        self.apply_layernorm_1p = apply_layernorm_1p
+
+        # List of hidden sizes supported in the persistent layer norm kernel.
+        # If the hidden size is not supported, fall back to the non-persistent
+        # kernel.
+        persist_ln_hidden_sizes = [
+            1024,
+            1536,
+            2048,
+            2304,
+            3072,
+            3840,
+            4096,
+            5120,
+            6144,
+            8192,
+            10240,
+            12288,
+            12800,
+            15360,
+            16384,
+            18432,
+            20480,
+            24576,
+            25600,
+            30720,
+            32768,
+            40960,
+            49152,
+            65536,
+        ]
+        if normalized_shape not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
+            no_persist_layer_norm = True
+
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = (normalized_shape,)
+        self.normalized_shape = torch.Size(normalized_shape)
+        self.eps = eps
+        self.weight = Parameter(torch.Tensor(*normalized_shape).to(param_dtype))
+        self.bias = Parameter(torch.Tensor(*normalized_shape).to(param_dtype))
+        self.reset_parameters()
+        self.no_persist_layer_norm = no_persist_layer_norm
+        self.sequence_parallel = sequence_parallel
+
+    def reset_parameters(self):
+        if self.apply_layernorm_1p:
+            init.zeros_(self.weight)
+            init.zeros_(self.bias)
+        else:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
+
+    def forward(self, input):
+        weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
+
+        if self.no_persist_layer_norm:
+            assert (
+                FusedLayerNormAffineFunction is not None
+            ), "FusedLayerNormAffineFunction is not available, please install apex from https://github.com/NVIDIA/apex"
+            out = FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps, False)
+            return out
+        else:
+            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
+
+            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
+            # a populated '_base' field). This will result in schedule.py's
+            # deallocate_output_tensor() throwing an error, so a viewless tensor is
+            # created to prevent this.
+            # output = make_viewless_tensor(
+            #     inp=output, requires_grad=input.requires_grad, keep_graph=True)
+            return output
diff --git a/vescale/model/base_gpt/fuse_softmax.py b/vescale/model/base_gpt/fuse_softmax.py
new file mode 100644
index 0000000..25f3021
--- /dev/null
+++ b/vescale/model/base_gpt/fuse_softmax.py
@@ -0,0 +1,203 @@
+################################################################################
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+################################################################################
+# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates.
+################################################################################
+
+import torch
+from torch import nn
+
+from vescale.model.base_gpt.enums import AttnMaskType
+
+
+class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs the following three operations in sequence:
+    1. Scale the tensor.
+    2. Apply the upper triangular mask (typically used in GPT models).
+    3. Perform softmax.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        import scaled_upper_triang_masked_softmax_cuda
+
+        scale_t = torch.tensor([scale])
+        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0])
+
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        import scaled_upper_triang_masked_softmax_cuda
+
+        softmax_results, scale_t = ctx.saved_tensors
+        input_grads = scaled_upper_triang_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
+
+        return input_grads, None
+
+
+class ScaledMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs the following three operations in sequence:
+    1. Scale the tensor.
+    2. Apply the mask.
+    3. Perform softmax.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, mask, scale):
+        import scaled_masked_softmax_cuda
+
+        scale_t = torch.tensor([scale])
+        softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        import scaled_masked_softmax_cuda
+
+        softmax_results, scale_t = ctx.saved_tensors
+
+        input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
+        return input_grads, None, None
+
+
+class ScaledSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs the following two operations in sequence:
+    1. Scale the tensor.
+    2. Perform softmax.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        import scaled_softmax_cuda
+
+        scale_t = torch.tensor([scale])
+
+        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        import scaled_softmax_cuda
+
+        softmax_results, scale_t = ctx.saved_tensors
+
+        input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
+        return input_grads, None, None
+
+
+class FusedScaleMaskSoftmax(nn.Module):
+    """
+    Fused operation: scaling + mask + softmax
+
+    Arguments:
+        input_in_fp16: flag to indicate if the input is in fp16 data format.
+        input_in_bf16: flag to indicate if the input is in bf16 data format.
+        attn_mask_type: attention mask type (padding or causal)
+        scaled_masked_softmax_fusion: flag to indicate whether the user wants to use the fused softmax kernel
+        mask_func: mask function to be applied.
+        softmax_in_fp32: if true, softmax is performed in fp32 precision.
+        scale: scaling factor used in input tensor scaling.
+    """
+
+    def __init__(
+        self,
+        input_in_fp16,
+        input_in_bf16,
+        attn_mask_type,
+        scaled_masked_softmax_fusion,
+        mask_func,
+        softmax_in_fp32,
+        scale,
+    ):
+        super().__init__()
+        self.input_in_fp16 = input_in_fp16
+        self.input_in_bf16 = input_in_bf16
+        assert not (
+            self.input_in_fp16 and self.input_in_bf16
+        ), "both fp16 and bf16 flags cannot be active at the same time."
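+        # input_in_float16 gates the fused CUDA kernels below (they only accept
+        # fp16/bf16 inputs); softmax_in_fp32 upcasts before the softmax for
+        # numerical stability and casts back afterwards.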
+        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
+        self.attn_mask_type = attn_mask_type
+        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
+        self.mask_func = mask_func
+        self.softmax_in_fp32 = softmax_in_fp32
+        self.scale = scale
+
+        assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"
+
+    def forward(self, input, mask):
+        # [b, np, sq, sk]
+        assert input.dim() == 4
+
+        if self.is_kernel_available(mask, *input.size()):
+            return self.forward_fused_softmax(input, mask)
+        else:
+            return self.forward_torch_softmax(input, mask)
+
+    def is_kernel_available(self, mask, b, np, sq, sk):
+        attn_batches = b * np
+
+        if (
+            self.scaled_masked_softmax_fusion  # user wants the fused kernel
+            and self.input_in_float16  # input must be fp16 or bf16
+            and 16 < sk <= 4096  # sk must be in (16, 4096]
+            and sq % 4 == 0  # sq must be divisible by 4
+            and sk % 4 == 0  # sk must be divisible by 4
+            and attn_batches % 4 == 0  # b * np must be divisible by 4
+        ):
+            if 0 <= sk <= 4096:
+                batch_per_block = self.get_batch_per_block(sq, sk, b, np)
+
+                if self.attn_mask_type == AttnMaskType.causal:
+                    if attn_batches % batch_per_block == 0:
+                        return True
+                else:
+                    if sq % batch_per_block == 0:
+                        return True
+        return False
+
+    def forward_fused_softmax(self, input, mask):
+        b, np, sq, sk = input.size()
+        scale = self.scale if self.scale is not None else 1.0
+
+        if self.attn_mask_type == AttnMaskType.causal:
+            assert sq == sk, "causal mask is only for self attention"
+            # reshape the input into a 3D tensor (attn_batches, sq, sk)
+            input = input.view(-1, sq, sk)
+            probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
+            return probs.view(b, np, sq, sk)
+        else:
+            # input is a 4D tensor (b, np, sq, sk)
+            if mask is not None:
+                return ScaledMaskedSoftmax.apply(input, mask, scale)
+            else:
+                return ScaledSoftmax.apply(input, scale)
+
+    def forward_torch_softmax(self, input, mask):
+        if self.input_in_float16 and self.softmax_in_fp32:
+            input = input.float()
+
+        if self.scale is not None:
+            input = input * self.scale
+        mask_output = self.mask_func(input, mask) if mask is not None else input
+        probs = torch.nn.Softmax(dim=-1)(mask_output)
+
+        if self.input_in_float16 and self.softmax_in_fp32:
+            if self.input_in_fp16:
+                probs = probs.half()
+            else:
+                probs = probs.bfloat16()
+
+        return probs
+
+    @staticmethod
+    def get_batch_per_block(sq, sk, b, np):
+        import scaled_masked_softmax_cuda
+
+        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
diff --git a/vescale/model/base_gpt/jit_func.py b/vescale/model/base_gpt/jit_func.py
new file mode 100644
index 0000000..c129688
--- /dev/null
+++ b/vescale/model/base_gpt/jit_func.py
@@ -0,0 +1,40 @@
+################################################################################
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+################################################################################
+# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates.
+################################################################################ + +from typing import Optional + +import torch + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (torch.Tensor, Optional[torch.Tensor], torch.Tensor, float, bool) -> torch.Tensor + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.compile +def bias_dropout_add_fused_train( + x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float +) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +# @torch.jit.script +@torch.compile +def bias_dropout_add_fused_inference( + x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float +) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) diff --git a/vescale/model/base_gpt/mlp.py b/vescale/model/base_gpt/mlp.py new file mode 100644 index 0000000..f2c33fc --- /dev/null +++ b/vescale/model/base_gpt/mlp.py @@ -0,0 +1,101 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ + +import torch +from torch import nn + +from vescale.model.utils import bias_gelu_impl, openai_gelu + + +class SwitchMLP(nn.Module): + """ + Routes input to one of N MLP "experts" + """ + + def __init__(self, hidden_size, num_experts): + super().__init__() + self.router = nn.Linear(hidden_size, num_experts) + self.experts = torch.nn.ModuleList() + for _ in range(num_experts): + self.experts.append(ParallelMLP(hidden_size)) + + def forward(self, hidden_states): + # hidden_states: [s, b, h] + s = hidden_states.size(0) + b = hidden_states.size(1) + h = hidden_states.size(2) + route = self.router(hidden_states) + route = torch.nn.functional.softmax(route, dim=2) + max_prob, max_ind = torch.max(route, dim=2) + max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] + + # TODO (rprenger) TODO this could be made easier to read + # Converting [s, b, h] to [s*b, h]. + # Each vector could be routed differently + # [s*b h] + hidden_states = hidden_states.view(-1, hidden_states.size(2)) + max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] + max_ind = max_ind.view(-1) # [s*b] + + output_total = torch.empty_like(hidden_states) + output_bias_total = torch.empty_like(hidden_states) + # TODO (rprenger) This does each expert in serial, but it could be parallelized + + for expert_num, expert in enumerate(self.experts): + local_indices = (max_ind == expert_num).nonzero() + hidden = hidden_states[local_indices, :] + output, output_bias = expert(hidden) + output_bias = output_bias.expand_as(output) + output_total[local_indices, :] = output + output_bias_total[local_indices, :] = output_bias + + output_total = output_total * max_prob + output_bias_total = output_bias_total * max_prob + output_total = output_total.view(s, b, h) + output_bias_total = output_bias_total.view(s, b, h) + + return output_total, output_bias_total + + +class ParallelMLP(nn.Module): + """MLP. 
+ + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, h, param_dtype=torch.float32, bias_gelu_fusion=None): + super().__init__() + + # Project to 4h. + self.dense_h_to_4h = nn.Linear(h, h * 4, bias=False, dtype=param_dtype) + # torch.nn.init.normal_(self.dense_h_to_4h.weight, mean=0.0, std=0.02) + torch.nn.init.xavier_normal_(self.dense_h_to_4h.weight) + self.dense_h_to_4h_bias = nn.Parameter(torch.zeros(4 * h, dtype=param_dtype)) + + self.bias_gelu_fusion = bias_gelu_fusion + self.activation_func = openai_gelu + + # Project back to h. + self.dense_4h_to_h = nn.Linear(4 * h, h, bias=False, dtype=param_dtype) + torch.nn.init.xavier_uniform_(self.dense_4h_to_h.weight) + # torch.nn.init.normal_(self.dense_4h_to_h.weight, mean=0.0, std=0.02) + self.dense_4h_to_h_bias = nn.Parameter(torch.zeros(h, dtype=param_dtype)) + + def forward(self, hidden_states): + intermediate_parallel = self.dense_h_to_4h(hidden_states) + bias_parallel = self.dense_h_to_4h_bias + + if self.bias_gelu_fusion: + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel + bias_parallel) + + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + output_bias = self.dense_4h_to_h_bias + return output, output_bias diff --git a/vescale/model/base_gpt/rotary.py b/vescale/model/base_gpt/rotary.py new file mode 100644 index 0000000..eaa8d76 --- /dev/null +++ b/vescale/model/base_gpt/rotary.py @@ -0,0 +1,52 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ + +from typing import Union + +import torch +from torch import Tensor + +from vescale.dtensor.dtensor import DTensor + + +def _rotate_half(x: Union[Tensor, DTensor]) -> Union[Tensor, DTensor]: + """Change sign so the last dimension becomes [-odd, +even] + + Args: + x (Tensor): Input tensor + + Returns: + Tensor: Tensor rotated half + """ + + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(t: Union[Tensor, DTensor], freqs: Union[Tensor, DTensor]) -> Union[Tensor, DTensor]: + """Apply rotary positional embedding to input tensor T. + + check https://kexue.fm/archives/8265 for detailed formulas + + Args: + t (Tensor): Input tensor T is of shape [seq_length, ... 
, dim] + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] + + Returns: + Tensor: The input tensor after applying RoPE + """ + rot_dim = freqs.shape[-1] + + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t.narrow(-1, 0, rot_dim), t.narrow(-1, rot_dim, max(t.size()[-1] - rot_dim, 0)) + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + cos_ = torch.cos(freqs).to(t.dtype) + sin_ = torch.sin(freqs).to(t.dtype) + + t = (t * cos_) + (_rotate_half(t) * sin_) + return torch.cat((t, t_pass), dim=-1) diff --git a/vescale/model/base_gpt/transformer_block.py b/vescale/model/base_gpt/transformer_block.py new file mode 100644 index 0000000..a2c09be --- /dev/null +++ b/vescale/model/base_gpt/transformer_block.py @@ -0,0 +1,135 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ + +import torch +from torch import nn +from contextlib import nullcontext +from vescale.dtensor.dtensor import DTensor +from vescale.initialize.deferred_init import deferred_init +from vescale.model.base_gpt.transformer_layer import ParallelTransformerLayer +from vescale.model.random import get_cuda_rng_tracker + + +class TransformerBlock(nn.Module): + """Transformer class.""" + + def __init__( + self, + num_layer, + args, + drop_path_rate=0.0, + pre_process=True, + deferred_init=False, + ): + super().__init__() + + self.config = args + self.drop_path_rate = drop_path_rate + self.pre_process = pre_process + self.num_layer = num_layer + self.deferred_init = deferred_init + + # required for pipeline parallel schedules + self.input_tensor = None + self._build_layers() + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_number): + if self.deferred_init: + layer_config = { + "init_method": self.config.init_method, + "output_layer_init_method": self.config.output_layer_init_method, + "layer_number": layer_number, + "args": self.config, + "drop_path_rate": self.drop_path_rate, + } + layer = deferred_init(ParallelTransformerLayer, **layer_config) + else: + layer = ParallelTransformerLayer( + self.config.init_method, + self.config.output_layer_init_method, + layer_number, + self.config, + self.drop_path_rate, + ) + + return layer + + # offset is implicit in TransformerLayer + self.transformer_layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layer)]) + self.layers = torch.nn.Sequential() + for i in range(len(self.transformer_layers)): + self.layers.append(self.transformer_layers[i]) + + def _get_layer(self, layer_number): + return self.transformer_layers[layer_number] + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. 
This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward( + self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None + ): + # hidden_states (float): [s, b, h] + # attention_mask (bool): [1, 1, s, s] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + # hidden_states = make_viewless_tensor( + # inp=hidden_states, + # requires_grad=True, + # keep_graph=True, + # ) + + rng_context = nullcontext() + if isinstance(hidden_states, DTensor): + placements = hidden_states.placements + # check sbh, for s + is_sp = any(placement.is_shard(dim=0) for placement in placements) + if is_sp: + rng_context = get_cuda_rng_tracker().fork() + + with rng_context: + for layer in self.transformer_layers: + hidden_states = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_output=encoder_output, + enc_dec_attn_mask=enc_dec_attn_mask, + inference_params=inference_params, + ) + + return hidden_states diff --git a/vescale/model/base_gpt/transformer_layer.py b/vescale/model/base_gpt/transformer_layer.py new file mode 100644 index 0000000..f9931d1 --- /dev/null +++ b/vescale/model/base_gpt/transformer_layer.py @@ -0,0 +1,194 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ + +from contextlib import nullcontext + +import torch +from torch import nn + +from vescale.model.attention.dmodule_parallel_attention import ParallelAttention +from vescale.model.base_gpt.fuse_layer_norm import MixedFusedLayerNorm as LayerNorm +from vescale.model.base_gpt.jit_func import ( + bias_dropout_add_fused_inference, + bias_dropout_add_fused_train, + get_bias_dropout_add, +) +from vescale.model.base_gpt.mlp import ParallelMLP, SwitchMLP + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=0.0): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_state): + if self.drop_prob == 0.0 or not self.training: + return hidden_state + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + # hidden_state: [s, b, h] + random_tensor = keep_prob + torch.rand_like(hidden_state) + random_tensor.floor_() # binarize + output = hidden_state.div(keep_prob) * random_tensor + return output + + +class ParallelTransformerLayer(nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__( + self, + init_method, + output_layer_init_method, + layer_number, + args, + drop_path_rate=0.0, + ): + super().__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm = args.apply_residual_connection_post_layernorm + + self.bf16 = args.bf16 + + # Layernorm on the input data. + self.input_layernorm = LayerNorm( + args.hidden_size, + eps=args.layernorm_epsilon, + no_persist_layer_norm=not args.persist_layer_norm, + param_dtype=args.param_dtype, + ) + + # Self attention. + self.self_attention = ParallelAttention( + args.hidden_size, + args.kv_channels, + args.num_attention_heads, + args.world_size, + 1, # n_shared_qhead + args.param_dtype, + ) + self.hidden_dropout = args.hidden_dropout + self.bias_dropout_fusion = args.bias_dropout_fusion + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNorm( + args.hidden_size, + eps=args.layernorm_epsilon, + no_persist_layer_norm=not args.persist_layer_norm, + param_dtype=args.param_dtype, + ) + + # MLP + if args.num_experts is not None: + self.mlp = SwitchMLP(init_method, output_layer_init_method, args) + else: + self.mlp = ParallelMLP(args.hidden_size, param_dtype=args.param_dtype) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + + def forward( + self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output, attention_bias = self.self_attention( + layernorm_output, attention_mask, inference_params=inference_params + ) + + # assert not torch.isnan(attention_output.to_local() + # ).any(), attention_output + # assert not torch.isnan(attention_bias.to_local()).any(), attention_bias + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.drop_path is None: + # jit scripting for a nn.module (with dropout) is not + # trigerring the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. 
+ if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, attention_bias.expand_as(residual), residual, self.hidden_dropout + ) + else: + out = attention_output + attention_bias + out = torch.nn.functional.dropout(out, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + self.drop_path(out) + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # assert not torch.isnan(layernorm_output).any() + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + # assert not torch.isnan(mlp_output.to_local()).any(), mlp_output + # assert not torch.isnan(mlp_bias.to_local()).any(), mlp_bias + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + if self.drop_path is None: + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + # output = dtensor.utils.make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True) + # + else: + out = mlp_output + mlp_bias + out = torch.nn.functional.dropout(out, p=self.hidden_dropout, training=self.training) + output = residual + self.drop_path(out) + + return output + + def forward_util(self, input_tensor, data): + ret = { + "hidden_states": input_tensor if input_tensor is not None else data["hidden_states"], + "attention_mask": data["attention_mask"], + } + return [ret["hidden_states"], ret["attention_mask"]] + + def output_utils(self, p2p_tensor): + p2p_tensor = torch.permute(p2p_tensor, (0, 2, 1)) + return p2p_tensor diff --git a/vescale/model/base_gpt/utils.py b/vescale/model/base_gpt/utils.py new file mode 100644 index 0000000..3a67817 --- /dev/null +++ b/vescale/model/base_gpt/utils.py @@ -0,0 +1,27 @@ +################################################################################ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. 
+################################################################################
+
+import functools
+from typing import Callable
+
+from optree import tree_map
+from vescale.dtensor.dtensor import DTensor
+
+
+def switch_dtensor(func: Callable):
+    @functools.wraps(func)
+    def wrap(*args, **kwargs):
+        def to_tensor(x):
+            if isinstance(x, DTensor):
+                return x.to_local()
+            return x
+
+        new_args = tree_map(to_tensor, args)
+        new_kwargs = tree_map(to_tensor, kwargs)
+        out = func(*new_args, **new_kwargs)
+        return out
+
+    return wrap
diff --git a/vescale/ndtimeline/README.md b/vescale/ndtimeline/README.md
new file mode 100644
index 0000000..9f21442
--- /dev/null
+++ b/vescale/ndtimeline/README.md
@@ -0,0 +1,55 @@
+# ndtimeline (N-Dimension Timeline)
+
+## Why ndtimeline?
+
+- When training LLMs (Large Language Models) at an extremely large scale, several challenges need to be overcome:
+
+  - Sink machines (i.e. stragglers) tend to launch CUDA kernels slowly, significantly reducing training efficiency.
+  - Traditional tools such as the torch profiler or nsys can only offer information within one physical machine, whereas communication occurs among multiple or even thousands of machines.
+  - Although the torch profiler can provide details about training, it comes at a considerable cost, making it impractical to keep enabled constantly. The large size of the tracing file is also a significant issue, making analysis difficult.
+
+- We require a mechanism that jointly collects and visualizes training details across numerous machines, with low overhead and a small tracing log, to effectively detect stragglers and confirm the training status.
+
+## What is ndtimeline?
+
+### Insights
+- With `CUDA Events` provided by the device, we can record the durations of interesting parts.
+- We can utilize a reference `CUDA Event` as a clock with a Unix timestamp, allowing comparisons between events to provide a full span that includes both the starting time and the duration.
+- Clocks on different machines are challenging to synchronize precisely, but we can simulate a global clock through communication to offer a consistent view of spans.
+- To minimize overhead, we record events in multiple stages and flush them in another thread at specific intervals.
+- To maximize flexibility, `ndtimeline` exposes handlers for users to inject during pre and post metric processing and perform any desired operations.
+- As metric collectors are located in each training process, they follow the same semantics as parallelism on each rank, making it easy to extend ndtimeline when new types of parallelism are introduced.
+
+### Architecture
+Assume there are two ranks on one machine.
+
+NDtimeline
+
+### Important details
+  - Communication stream
+    - Torch does not inherently offer an interface to obtain the CUDA stream used for NCCL communication. `ProcessGroupNCCL` maintains a `CUDA Stream` pool and takes a stream from it when the user does not manually set the `CUDA Stream` before communication.
+    - We modify torch to establish a new interface for this purpose.
+    - An important **caveat**: the first communication operation will be lost in the tracing statistics because the `CUDA Stream` is allocated lazily when the first communication operation is initiated. Therefore, you may encounter some information logs such as `p2p stream is not available, skipped`; this is expected.
+  - is_internal_vescale
+    - We employ `ndtimeline` to emit metrics to an internal service, which may potentially disclose important secrets. Hence, we have to remove some `NDHandler` implementations and the corresponding meta information.
+
+## How to use ndtimeline?
+- Both **higher**- and **lower**-level APIs are provided; a minimal usage sketch follows this list.
+  - `ndtimeline.api` provides three key higher-level APIs: `init_ndtimers`, `wait` and `flush`.
+    - `init_ndtimers` initializes the timers. The metrics injected in vescale are predefined in `ndtimeline.predefined`; the handlers for processing metrics (`NDHandler` subclasses, defined in `ndtimeline.handlers`) can also be passed to `init_ndtimers`.
+    - `wait` ensures that metrics are flushed and should be called at the end of main.py.
+    - `flush` flushes the collected metrics and calibrates the simulated global clock.
+  - `ndtimeline.api` provides another two APIs: `inc_step` and `set_global_step`.
+    - They are introduced to align with the traditional approach of maintaining the global step, instead of a `global_step_getter` function.
+- The **lower**-level APIs are rather complex.
+  - Investigate `ndtimeline.timer.NDTimerManager` and `ndtimeline.sock_streamer.NDtimelineStreamer` to understand how to use them.
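+
+A minimal sketch of the higher-level API (hypothetical wiring: `num_steps` and `train_one_step` are placeholders, and the exact signatures and optional arguments of `init_ndtimers` live in `vescale/ndtimeline/api.py`):
+
+```python
+# Hypothetical wiring of the higher-level API; exact signatures and optional
+# arguments (e.g., custom NDHandlers) are defined in vescale/ndtimeline/api.py.
+from vescale.ndtimeline import init_ndtimers, flush, wait, inc_step
+
+init_ndtimers()  # assumption: defaults enable the predefined timers
+
+for step in range(num_steps):  # num_steps / train_one_step are placeholders
+    train_one_step()
+    inc_step()   # advance the global step used to tag collected metrics
+    flush()      # hand metrics to the handlers and calibrate the global clock
+
+wait()  # block until all metrics are flushed before the process exits
+```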
diff --git a/vescale/ndtimeline/README.md b/vescale/ndtimeline/README.md
new file mode 100644
index 0000000..9f21442
--- /dev/null
+++ b/vescale/ndtimeline/README.md
@@ -0,0 +1,55 @@
+# ndtimeline (N-Dimension Timeline)
+
+## Why ndtimeline?
+
+- When training LLMs (Large Language Models) at an extremely large scale, several challenges need to be overcome:
+
+  - Sink machines (i.e. stragglers) tend to launch CUDA kernels slowly, significantly reducing training efficiency.
+  - Traditional tools such as the torch profiler or nsys can only offer information within one physical machine, whereas communication occurs among multiple or even thousands of machines.
+  - Although the torch profiler can provide details about training, it comes at a considerable cost, making it impractical to keep constantly enabled. The large size of the tracing file is also a significant issue, making analysis difficult.
+
+- We require a mechanism to jointly collect and visualize training details across numerous machines, with low cost and a small tracing log, to effectively detect stragglers and confirm the training status.
+
+## What is ndtimeline?
+
+### Insights
+- With `CUDA Events` provided by the device, we can record the durations of interesting parts.
+- We can utilize a reference `CUDA Event` as a clock with a Unix timestamp, allowing comparisons between events to provide a full span that includes both the starting time and the duration.
+- Clocks on different machines are challenging to synchronize precisely, but we can simulate a global clock through communication to offer a consistent view of spans.
+- To minimize overhead, we record events in multiple stages and flush them in another thread at specific intervals.
+- To maximize flexibility, `ndtimeline` exposes handlers that users can inject into pre and post metric processing to perform any desired operations.
+- As metric collectors live in each training process, they keep the same semantics as the parallelism on each rank, making it easy to extend ndtimeline when new types of parallelism are introduced.
+
+### Architecture
+Assume there are two ranks on one machine.
+
+NDtimeline
+
+### Important details
+  - Communication Stream
+    - Torch does not inherently offer an interface to obtain the stream used for NCCL communication. `ProcessGroupNCCL` maintains a `CUDA Stream` pool and takes a stream from it when the user does not manually set the `CUDA Stream` before communication.
+    - We modify torch to establish a new interface for this purpose.
+    - An important **caveat**: the first communication operation will be missing from the tracing statistics, because the `CUDA Stream` is allocated lazily when the first communication operation is initiated. Therefore, you may encounter information logs such as `p2p stream is not available, skipped`; this is expected.
+  - is_internal_vescale
+    - We employ `ndtimeline` to emit metrics to an internal service, which could potentially disclose important secrets. Hence, we have to remove some `NDHandler` implementations and the corresponding meta information.
+
+## How to use ndtimeline?
+- Both **higher**- and **lower**-level APIs are provided; a quick-start sketch follows this list.
+  - `ndtimeline.api` provides three key higher-level APIs: `init_ndtimers`, `wait` and `flush`
+    - `init_ndtimers` enables the timers. The metrics injected into veScale are predefined in `ndtimeline.predefined`; the metric-processing methods, called `NDHandler`s and defined in `ndtimeline.handlers`, can also be configured via `init_ndtimers`.
+    - `wait` ensures that metrics are flushed and should be called at the end of main.py.
+    - `flush` flushes the collected metrics and calibrates the simulated global clock.
+  - `ndtimeline.api` provides two more APIs: `inc_step` and `set_global_step`
+    - They are introduced to align with the traditional approach of maintaining the global step, instead of a `global_step_getter` function.
+- The **lower**-level APIs are rather complex
+  - Investigate `ndtimeline.timer.NDTimerManager` and `ndtimeline.sock_streamer.NDtimelineStreamer` to understand how to use them.
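+
+A minimal quick-start sketch of the higher-level APIs (illustrative only; the
+training loop and `num_iterations` are placeholders, and the device mesh,
+model, and optimizer setup are elided):
+
+```python
+import torch
+import torch.distributed as dist
+from vescale.ndtimeline import init_ndtimers, flush, wait
+
+dist.init_process_group("nccl")
+torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+# ... build VESCALE_DEVICE_MESH, model and optimizer here (placeholders) ...
+
+num_iterations = 100
+step = 0
+init_ndtimers(step_getter=lambda: step)  # enable the predefined timers
+for step in range(num_iterations):
+    ...                                  # one training iteration
+flush()                                  # hand collected metrics to the handlers
+wait()                                   # call at the end of main.py
+```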
+
+- An Example
+
+  - Demo of the default tracing file
+
+    NDtimeline_trace
+
+  - Users can utilize `init_ndtimers` and pass `ndtimeline.handlers.LocalTimelineNDHandler` as the post handler. A tracing file will then be generated in the current directory of that machine.
+  - The tracing file can be visualized with the trace viewer at https://ui.perfetto.dev/.
+  - In case you need a tracing file covering ranks on different machines, you can implement an MQHandler yourself and send all metrics to central storage. This gives you a way to filter and generate the tracing file for specified ranks.
\ No newline at end of file
diff --git a/vescale/ndtimeline/__init__.py b/vescale/ndtimeline/__init__.py
new file mode 100644
index 0000000..cf86cbb
--- /dev/null
+++ b/vescale/ndtimeline/__init__.py
@@ -0,0 +1,87 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from . import handlers  # noqa: F401
+from . import exceptions  # noqa: F401
+from . import logger  # noqa: F401
+from . import predefined  # noqa: F401
+
+from .binary_protocol import serialize_to_package, encode_package, loads_fn, dumps_fn
+from .pool import DefaultEventPool, CudaEventPool
+from .world_info import WorldInfo, TrainingInfo, TopoInfo
+from .timer import (
+    NDTimerManager,
+    NDTimerManagerSingleton,
+    DeviceTimerMeta,
+    ndtimeit,
+    NDMetricLevel,
+    ndtimer,
+    ndtimeit_p2p,
+)
+from .sock_streamer import NDtimelineStreamer
+from .variables import (
+    NDTIMELINE_INNER_GLOBAL_STEP_KEY,
+    SOCK_TIMEOUT_CLIENT,
+    SOCK_PARENT_DIR,
+    SOCK_PATH,
+    NDTIMELINE_STREAM_KEY,
+)
+from .stream import get_nccl_p2p_stream, get_nccl_coll_stream
+from .api import flush, wait, init_ndtimers, set_global_step, inc_step
+
+__all__ = [
+    "handlers",
+    "logger",
+    "exceptions",
+    "predefined",
+    "serialize_to_package",
+    "encode_package",
+    "loads_fn",
+    "dumps_fn",
+    "DefaultEventPool",
+    "CudaEventPool",
+    "WorldInfo",
+    "TrainingInfo",
+    "TopoInfo",
+    "NDTimerManager",
+    "NDTimerManagerSingleton",
+    "DeviceTimerMeta",
+    "ndtimeit",
+    "NDtimelineStreamer",
+    "NDTIMELINE_INNER_GLOBAL_STEP_KEY",
+    "SOCK_TIMEOUT_CLIENT",
+    "SOCK_PARENT_DIR",
+    "SOCK_PATH",
+    "NDTIMELINE_STREAM_KEY",
+    "NDMetricLevel",
+    "get_nccl_p2p_stream",
+    "get_nccl_coll_stream",
+    "ndtimer",
+    "ndtimeit_p2p",
+    "flush",
+    "wait",
+    "init_ndtimers",
+    "set_global_step",
+    "inc_step",
+]
+
+try:
+    import _internal
+
+    __all__.append("_internal")
+except ImportError:
+    pass
diff --git a/vescale/ndtimeline/api.py b/vescale/ndtimeline/api.py
new file mode 100644
index 0000000..6ca266a
--- /dev/null
+++ b/vescale/ndtimeline/api.py
@@ -0,0 +1,396 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from typing import Optional, List, Callable, Tuple
+import math
+from copy import deepcopy
+
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal
+
+import torch
+import torch.distributed as dist
+
+from .is_internal import is_internal_vescale
+
+if is_internal_vescale():
+    from vescale.fsdp import FullyShardedDataParallel, ShardingStrategy
+    from vescale.fsdp._init_utils import HYBRID_SHARDING_STRATEGIES
+    from ._internal import _get_ip_by_env, _get_role_id, _get_run_id, _get_trial_id
+else:
+    # make python happy
+    class FullyShardedDataParallel:
+        pass
+
+    class ShardingStrategy:
+        pass
+
+    HYBRID_SHARDING_STRATEGIES = ""
+
+
+from vescale.dtensor.device_mesh import DeviceMesh
+from vescale.devicemesh_api import VESCALE_DEVICE_MESH
+from .timer import NDTimerManagerSingleton, DeviceTimerMeta, NDMetricLevel
+from .handlers import NDHandler, SockNDHandler, LocalTimelineNDHandler
+from .world_info import WorldInfo
+from .sock_streamer import NDtimelineStreamer
+from .logger import NDTimelineLogger
+from .predefined import (
+    FORWARD_COMPUTE,
+    BACKWARD_COMPUTE,
+    CROSS_MESH_RECV,
+    CROSS_MESH_SEND,
+    RECV_FORWARD,
+    RECV_BACKWARD,
+    SEND_FORWARD,
+    SEND_BACKWARD,
+    SEND_FORWARD_RECV_BACKWARD,
+    SEND_BACKWARD_RECV_FORWARD,
+    UNSHARD_AG,
+    GRAD_RS,
+    GRAD_AR,
+)
+from .fsdp_patch import patch_fsdp
+
+
+def init_ndtimers(
+    rank: Optional[int] = None,
+    mode: Literal["fsdp", "hybrid"] = "hybrid",
+    wrapped_fsdp_module: Optional[FullyShardedDataParallel] = None,
+    device_mesh: Optional[DeviceMesh] = None,
+    mesh_shape: Optional[Tuple[int, ...]] = None,
+    local_rank: Optional[int] = None,
+    step_getter: Optional[Callable[[], int]] = None,
+    enable_streamer: bool = True,
+    n_rank_per_host: Optional[int] = None,
+    pre_handlers: Optional[List[NDHandler]] = None,
+    post_handlers: Optional[List[NDHandler]] = None,
+    user_spcified_timers: Optional[List[DeviceTimerMeta]] = None,
+    level: NDMetricLevel = NDMetricLevel.DEBUG,
+    ip: str = "0.0.0.0",
+    **kwargs,
+):
+    """
+    High-level API to enable timers.
+    It MUST be called after torch.cuda.set_device has been called and the default process group has been initialized.
+
+    Args:
+        rank (int): rank id. If rank is None, it will be determined by torch.distributed.get_rank.
+
+        mode (str): `fsdp` or `hybrid` mode; `fsdp` is currently only supported in the internal version.
+
+        wrapped_fsdp_module (FullyShardedDataParallel): `FullyShardedDataParallel`-wrapped torch.nn.Module;
+            only used in fsdp mode and only valid in the internal version.
+
+        device_mesh (DeviceMesh): only used in fsdp mode and only valid in the internal version.
+
+        mesh_shape (Tuple): only used in fsdp mode and only valid in the internal version.
+
+        local_rank (int): local rank id. If local_rank is None, it will be determined by VESCALE_DEVICE_MESH.
+
+        step_getter (Callable[[], int]): function to get the current global step. If it is None, steps will always be set to 0.
+            Another choice is to use `set_global_step` and `inc_step` to maintain the step.
+
+        enable_streamer (bool): If set, a streamer process will be forked and post_handlers can then be enabled.
+
+        n_rank_per_host (int): number of devices on one machine. If it is None, it will be determined by torch.cuda.device_count.
+
+        pre_handlers (List[NDHandler]): list of NDHandlers triggered immediately after `flush` on each training process.
+            `SockNDHandler` will be automatically injected into pre_handlers when the streamer is enabled and no pre_handlers are given.
+
+        post_handlers (List[NDHandler]): list of NDHandlers triggered in the streamer process.
+            `LocalTimelineNDHandler` will be automatically injected when the streamer is enabled and no post_handlers are given.
+
+        user_spcified_timers (List[DeviceTimerMeta]): list of DeviceTimerMeta registered by the user.
+
+        level (NDMetricLevel): metrics whose level is lower than this will be ignored.
+
+        ip (str): pod/host ip.
+
+    Returns:
+        Nothing
+    """
+
+    post_handlers = [] if post_handlers is None else post_handlers
+    pre_handlers = [] if pre_handlers is None else pre_handlers
+    user_spcified_timers = [] if user_spcified_timers is None else user_spcified_timers
+
+    if mode not in ["hybrid", "fsdp"]:
+        raise NotImplementedError(f"mode {mode} not implemented")
+
+    if mode == "fsdp" and not is_internal_vescale():
+        raise NotImplementedError("fsdp is not currently supported in the open-source version")
+
+    if mode != "fsdp" and wrapped_fsdp_module is not None:
+        raise ValueError("wrapped_fsdp_module and mode should be set accordingly")
+
+    if NDTimerManagerSingleton.is_initialized():
+        NDTimelineLogger().warning("timers already initialized, no need for initialization")
+        return
+
+    local_rank = VESCALE_DEVICE_MESH.get_local_rank() if local_rank is None else local_rank
+    rank = torch.distributed.get_rank() if rank is None else rank
+    n_rank_per_host = torch.cuda.device_count() if n_rank_per_host is None else n_rank_per_host
+
+    world_size = dist.get_world_size()
+    ddp_rank, ddp_size = 0, 1
+    if mode == "hybrid":
+        tp_size = VESCALE_DEVICE_MESH.get_strategy_size("TP")
+        dp_size = VESCALE_DEVICE_MESH.get_strategy_size("DP")
+        pp_size = VESCALE_DEVICE_MESH.get_strategy_size("PP")
+
+        tp_rank = VESCALE_DEVICE_MESH.get_tensor_parallel_rank()
+        pp_rank = VESCALE_DEVICE_MESH.get_pipeline_parallel_rank()
+        dp_rank = VESCALE_DEVICE_MESH.get_data_parallel_rank()
+
+        assert (
+            tp_size * dp_size * pp_size == world_size
+        ), f"tp_size: {tp_size}, dp_size: {dp_size}, pp_size: {pp_size}, world_size: {world_size}"
+    elif mode == "fsdp":
+        tp_size, pp_size = 1, 1
+        tp_rank, pp_rank = 0, 0
+
+        patch_fsdp()
+        if wrapped_fsdp_module is not None:
+            intra_node_group = wrapped_fsdp_module.process_group
+            inter_node_group = getattr(wrapped_fsdp_module, "_inter_node_pg", None)
+            dp_rank, dp_size, ddp_rank, ddp_size = _calculate_topo(
+                intra_node_group,
+                inter_node_group,
+                wrapped_fsdp_module.sharding_strategy,
+                world_size,
+            )
+        elif device_mesh is not None:
+            dp_rank, dp_size, ddp_rank, ddp_size = _calculate_topo_by_shape(tuple(device_mesh.mesh.shape), rank)
+        elif mesh_shape is not None:
+            dp_rank, dp_size, ddp_rank, ddp_size = _calculate_topo_by_shape(mesh_shape, rank)
+        else:
+            raise ValueError("for fsdp, at least one of device_mesh, wrapped_fsdp_module, or mesh_shape must be given")
+
+    if enable_streamer:
+        if local_rank == 0:
+            if len(post_handlers) > 0:
+                NDtimelineStreamer.init(local_rank, post_handlers)
+            else:
+                NDtimelineStreamer.init(
+                    local_rank,
+                    [
+                        LocalTimelineNDHandler(n_rank_per_host),
+                    ],
+                )
+        if len(pre_handlers) == 0 or all(not isinstance(handler, SockNDHandler) for handler in pre_handlers):
+            pre_handlers.append(SockNDHandler())
+
+    trial_id, run_id, role_id = 0, 0, 0
+
+    if is_internal_vescale():
+        if ip == "0.0.0.0":
+            ip = _get_ip_by_env()
+        trial_id = _get_trial_id()
+        run_id = _get_run_id()
+        role_id = _get_role_id()
+
+    NDTimerManagerSingleton(
+        WorldInfo(
+            rank=rank,
+            local_rank=local_rank,
+            tp_rank=tp_rank,
+            pp_rank=pp_rank,
+            dp_rank=dp_rank,
+            ddp_rank=ddp_rank,
+            tp_size=tp_size,
+            pp_size=pp_size,
+            dp_size=dp_size,
+            ddp_size=ddp_size,
+            world_size=world_size,
+            ip=ip,
+            trial_id=trial_id,
+            run_id=run_id,
+            role_id=role_id,
+        ),
+        init_cuda_dist=True,
+        handlers=pre_handlers,
+        metric_level=level,
+    )
+
+    extra = {}
+    mq_sinks = []
+    if is_internal_vescale():
+        from ._internal import MQNDHandler
+
+        for handler in post_handlers:
+            if isinstance(handler, MQNDHandler):
+                mq_sinks.extend(handler.mq_sinks)
+        if len(mq_sinks) != 0:
+            extra = {"sinks": mq_sinks}
+
+    if mode == "hybrid":
+        predefined_timers = [
+            DeviceTimerMeta(SEND_BACKWARD, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(SEND_FORWARD, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(RECV_FORWARD, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(RECV_BACKWARD, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(SEND_FORWARD_RECV_BACKWARD, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(SEND_BACKWARD_RECV_FORWARD, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(CROSS_MESH_RECV, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(CROSS_MESH_SEND, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(FORWARD_COMPUTE, is_cpu_op=False, step_getter=step_getter),
+            DeviceTimerMeta(BACKWARD_COMPUTE, is_cpu_op=False, step_getter=step_getter),
+        ]
+    else:
+        predefined_timers = [
+            DeviceTimerMeta(
+                UNSHARD_AG,
+                is_cpu_op=False,
+                step_getter=step_getter,
+                common_extra=deepcopy(extra),
+            ),
+            DeviceTimerMeta(
+                GRAD_RS,
+                is_cpu_op=False,
+                step_getter=step_getter,
+                common_extra=deepcopy(extra),
+            ),
+            DeviceTimerMeta(
+                GRAD_AR,
+                is_cpu_op=False,
+                step_getter=step_getter,
+                common_extra=deepcopy(extra),
+            ),
+            DeviceTimerMeta(
+                FORWARD_COMPUTE,
+                is_cpu_op=False,
+                step_getter=step_getter,
+                common_extra=deepcopy(extra),
+            ),
+            DeviceTimerMeta(
+                BACKWARD_COMPUTE,
+                is_cpu_op=False,
+                step_getter=step_getter,
+                common_extra=deepcopy(extra),
+            ),
+        ]
+    predefined_timers.extend(user_spcified_timers)
+    NDTimerManagerSingleton().register_timers(predefined_timers)
+
+
+def wait():
+    """
+    High-level API that lets the timers exit gracefully.
+    """
+    if NDTimerManagerSingleton.is_initialized():
+        NDTimerManagerSingleton().wait()
+
+
+def set_global_step(global_step: int = 0):
+    """
+    An alternative way to set the global step when `global_step_getter` is None.
+    """
+    if NDTimerManagerSingleton.is_initialized():
+        NDTimerManagerSingleton().global_step = global_step
+
+
+def inc_step(step: int = 1):
+    """
+    An alternative to `global_step_getter`: increase the global step when the getter is None.
+    """
+    if NDTimerManagerSingleton.is_initialized():
+        step_increased = NDTimerManagerSingleton().global_step + step
+        NDTimerManagerSingleton().global_step = step_increased
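+
+
+# Illustrative usage of the step-maintenance APIs (a sketch; `num_iterations`
+# and the loop body are placeholders). When the timers were initialized
+# WITHOUT a step_getter, the step can be maintained manually:
+#
+#     init_ndtimers(step_getter=None)
+#     for _ in range(num_iterations):
+#         ...          # one training iteration
+#         inc_step()   # advance the global step by 1
+#     flush()
+#     wait()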
+
+
+def flush(
+    step_range: Optional[range] = None,
+    next_iter_enabled: bool = True,
+    submit2handler: bool = True,
+    dynamic_calibrate: bool = False,
+    keep_timer_state: bool = False,
+    sequential_calibrate: bool = True,
+):
+    """
+    High-level API that flushes timer metrics to the handlers.
+
+    Args:
+        step_range (range): global step range. Theoretically, no step_getter is needed if the user uses the lower-level API;
+            in that case step_range is used to allocate steps to metrics. If step_getter is given, step_range can be ignored.
+
+        next_iter_enabled (bool): whether timers remain enabled after the flush.
+
+        submit2handler (bool): whether metrics are submitted to the handlers; False means the metrics are dropped.
+
+        dynamic_calibrate (bool): whether to recalibrate clocks at least every 20 minutes.
+
+        keep_timer_state (bool): if True, keep each timer's enabled/disabled state after the flush; next_iter_enabled is ignored in that case.
+
+        sequential_calibrate (bool): whether to calibrate clocks in the main thread or in a separate thread.
+
+    Returns:
+        Nothing
+    """
+    if NDTimerManagerSingleton.is_initialized():
+        step_range = range(0, 1) if step_range is None else step_range
+        NDTimerManagerSingleton().async_flush(
+            step_range,
+            next_iter_enabled=next_iter_enabled,
+            submit2handler=submit2handler,
+            dynamic_calibrate=dynamic_calibrate,
+            keep_timer_state=keep_timer_state,
+            sequential_calibrate=sequential_calibrate,
+        )
+
+
+def _calculate_topo(
+    intra_node_group: dist.ProcessGroup,
+    inter_node_group: dist.ProcessGroup,
+    sharding_strategy: ShardingStrategy,
+    world_size: int,
+) -> Tuple[int, int, int, int]:
+    if sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+        ddp_size = inter_node_group.size()
+        ddp_rank = inter_node_group.rank()
+        dp_size = intra_node_group.size()
+        dp_rank = intra_node_group.rank()
+        assert (
+            world_size == intra_node_group.size() * inter_node_group.size()
+        ), f"world_size: {world_size} intra_node_group: {dp_size} inter_node_group: {ddp_size}"
+        return dp_rank, dp_size, ddp_rank, ddp_size
+    elif sharding_strategy == ShardingStrategy.FULL_SHARD:
+        dp_size = intra_node_group.size()
+        dp_rank = intra_node_group.rank()
+        assert world_size == intra_node_group.size(), f"world_size: {world_size}"
+        return dp_rank, dp_size, 0, 1
+    else:
+        raise NotImplementedError("not implemented for ddp")
+
+
+def _calculate_topo_by_shape(mesh_shape: Tuple[int, ...], rank: int) -> Tuple[int, int, int, int]:
+    for m in mesh_shape:
+        assert m > 0 and isinstance(m, int)
+    if len(mesh_shape) == 2:
+        dim0, dim1 = mesh_shape[0], mesh_shape[1]
+        ddp_size, dp_size = dim0, dim1
+        mesh = torch.arange(math.prod(mesh_shape)).view(mesh_shape)
+        ddp_rank, dp_rank = torch.where(mesh == rank)
+        ddp_rank, dp_rank = int(ddp_rank), int(dp_rank)
+        return dp_rank, dp_size, ddp_rank, ddp_size
+    elif len(mesh_shape) == 1:
+        return rank, math.prod(mesh_shape), 0, 1
+    else:
+        raise ValueError(f"invalid mesh_shape {mesh_shape}")
diff --git a/vescale/ndtimeline/binary_protocol.py b/vescale/ndtimeline/binary_protocol.py
new file mode 100644
index 0000000..75a69bd
--- /dev/null
+++ b/vescale/ndtimeline/binary_protocol.py
@@ -0,0 +1,139 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import pickle
+import io
+import gc
+from typing import Any, Callable
+
+from .exceptions import ProtocolValidationError
+
+
+def dumps(v):
+    return pickle.dumps(v, protocol=4)
+
+
+def loads(binary):
+    gc.disable()
+    res = pickle.loads(binary)
+    gc.enable()
+    return res
+
+
+dumps_fn = dumps
+loads_fn = loads
+
+
+# +---------------------------------------------------------------+
+# | Magic Number 1Byte | Protocol Version 1Byte | Reserved 2Byte  |
+# +---------------------------------------------------------------+
+# | Payload Length 4Byte                                          |
+# +---------------------------------------------------------------+
+# | Payload                                                       |
+# +---------------------------------------------------------------+
+# | EOF Symbol 1Byte |
+# +------------------+
+# Both Payload Length and Magic Number are Little Endian
+
+
+MAGIC_NUMBER = (0x9C).to_bytes(length=1, byteorder="little")
+MAGIC_BYTES_LEN = len(MAGIC_NUMBER)
+PROTOCOL_VERSION_0 = (0x0).to_bytes(length=1, byteorder="little")
+PROTOCOL_VERSION_BYTES_LEN = len(PROTOCOL_VERSION_0)
+RESERVED = b"00"
+RESERVED_BYTES_LEN = len(RESERVED)
+EOF_SYMBOL = b"\n"
+EOF_SYMBOL_BYTES_LEN = len(EOF_SYMBOL)
+MAX_PAYLOAD_LEN = 1024 * 1024 * 128  # 128MiB
+PAYLOAD_LEN_BYTES_LEN = 4
+
+
+# encode_package encodes a payload into a package
+def encode_package(payload: bytes) -> bytes:
+    payload_len = len(payload)
+    if payload_len > MAX_PAYLOAD_LEN:
+        raise ValueError(f"payload size {payload_len}, larger than max size {MAX_PAYLOAD_LEN}")
+    payload_len_bytes = payload_len.to_bytes(length=PAYLOAD_LEN_BYTES_LEN, byteorder="little")
+    # memory efficient
+    return b"".join([MAGIC_NUMBER, PROTOCOL_VERSION_0, RESERVED, payload_len_bytes, payload, EOF_SYMBOL])
+
+
+# v: any picklable object
+def serialize_to_package(v: Any):
+    # payload = pickle.dumps(v, protocol=4)
+    payload = dumps_fn(v)
+    return encode_package(payload)
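+
+
+# Worked example (follows directly from the constants above): for the 2-byte
+# payload b"hi", encode_package produces
+#     b"\x9c" + b"\x00" + b"00" + b"\x02\x00\x00\x00" + b"hi" + b"\n"
+# i.e. magic number, protocol version, reserved bytes, little-endian payload
+# length, payload, and the EOF symbol.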
+
+
+def recv_and_validate(recv_func: Callable, preload_data: bytearray) -> bytes:
+    magic_bytes = read_or_recv(MAGIC_BYTES_LEN, recv_func, preload_data)
+    if magic_bytes != MAGIC_NUMBER:
+        raise ProtocolValidationError("MAGIC_NUMBER field is broken")
+    pt_version_bytes = read_or_recv(PROTOCOL_VERSION_BYTES_LEN, recv_func, preload_data)
+    if pt_version_bytes != PROTOCOL_VERSION_0:
+        raise ProtocolValidationError("PROTOCOL_VERSION_0 field is broken")
+    reserved_bytes = read_or_recv(RESERVED_BYTES_LEN, recv_func, preload_data)
+    if reserved_bytes != RESERVED:
+        raise ProtocolValidationError(f"RESERVED field is {reserved_bytes}, should be {RESERVED}")
+    payload_len_bytes = read_or_recv(PAYLOAD_LEN_BYTES_LEN, recv_func, preload_data)
+    payload_len = int.from_bytes(payload_len_bytes, byteorder="little")
+    if payload_len > MAX_PAYLOAD_LEN:
+        raise ProtocolValidationError(f"payload_len {payload_len} longer than {MAX_PAYLOAD_LEN}")
+    payload = read_or_recv(payload_len, recv_func, preload_data)
+    eof = read_or_recv(EOF_SYMBOL_BYTES_LEN, recv_func, preload_data)
+    if eof != EOF_SYMBOL:
+        raise ProtocolValidationError("EOF field is broken")
+    return payload
+
+
+def recv_to_buf(size: int, recv: Callable, preload_data: bytearray):
+    assert len(preload_data) <= size
+    buf = io.BytesIO()
+    buf.write(preload_data)
+    remaining = size - len(preload_data)
+    del preload_data[: len(preload_data)]
+    while remaining > 0:
+        chunk = recv(8192)
+        n = len(chunk)
+        if n == 0:
+            raise BrokenPipeError("recv 0 byte from socket")
+        if n <= remaining:
+            buf.write(chunk)
+            remaining -= n
+        else:
+            buf.write(chunk[:remaining])
+            preload_data.extend(chunk[remaining:])
+            return buf.getvalue()
+    return buf.getvalue()
+
+
+def read_or_recv(size: int, recv: Callable, preload_data: bytearray):
+    if len(preload_data) >= size:
+        res = bytes(preload_data[:size])
+        del preload_data[:size]
+        return res
+    else:
+        return recv_to_buf(size, recv, preload_data)
diff --git a/vescale/ndtimeline/exceptions.py b/vescale/ndtimeline/exceptions.py
new file mode 100644
index 0000000..35b995b
--- /dev/null
+++ b/vescale/ndtimeline/exceptions.py
@@ -0,0 +1,28 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+
+class ProtocolValidationError(ValueError):
+    def __init__(self, msg):
+        super().__init__(msg)
+        self.msg = msg
+
+
+class NDHandlerError(RuntimeError):
+    def __init__(self, msg):
+        super().__init__(msg)
+        self.msg = msg
diff --git a/vescale/ndtimeline/fsdp_patch.py b/vescale/ndtimeline/fsdp_patch.py
new file mode 100644
index 0000000..c3301a8
--- /dev/null
+++ b/vescale/ndtimeline/fsdp_patch.py
@@ -0,0 +1,28 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from .is_internal import is_internal_vescale
+
+if is_internal_vescale():
+    from ._internal import patch_fsdp, is_fsdp_patched
+else:
+
+    def patch_fsdp():
+        pass
+
+    def is_fsdp_patched():
+        return False
diff --git a/vescale/ndtimeline/handlers/__init__.py b/vescale/ndtimeline/handlers/__init__.py
new file mode 100644
index 0000000..171eac8
--- /dev/null
+++ b/vescale/ndtimeline/handlers/__init__.py
@@ -0,0 +1,34 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from .sock_handler import SockNDHandler +from .parser_handler import ParserNDHandler +from .logging_handler import LoggingNDHandler +from .local_raw_handler import LocalRawNDHandler +from .local_timeline_handler import LocalTimelineNDHandler +from .handler_base import NDHandler +from .do_nothing_handler import DoNothingNDHandler + +__all__ = [ + "SockNDHandler", + "ParserNDHandler", + "NDHandler", + "LoggingNDHandler", + "LocalRawNDHandler", + "DoNothingNDHandler", + "LocalTimelineNDHandler", +] diff --git a/vescale/ndtimeline/handlers/chrome_trace_event.py b/vescale/ndtimeline/handlers/chrome_trace_event.py new file mode 100644 index 0000000..809ea6d --- /dev/null +++ b/vescale/ndtimeline/handlers/chrome_trace_event.py @@ -0,0 +1,291 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+import random
+from dataclasses import dataclass
+from typing import Union, Optional, List, Dict, Tuple
+from abc import ABC, abstractmethod
+
+
+class TracingEvent(ABC):
+    """
+    chrome trace event format, see doc:
+    https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#
+    """
+
+    @abstractmethod
+    def to_objects(self) -> List[dict]:
+        pass
+
+
+@dataclass
+class CompleteEvent(TracingEvent):
+    name: str
+    cat: str
+    pid: Union[str, int]
+    tid: Union[str, int]
+
+    # start timestamp and duration (both in us)
+    ts: float
+    dur: float
+
+    args: Optional[dict] = None
+
+    def to_objects(self) -> List[dict]:
+        return [
+            {
+                "name": self.name,
+                "cat": self.cat,
+                "pid": self.pid,
+                "tid": self.tid,
+                "args": self.args or {},
+                "ts": self.ts,
+                "dur": self.dur,
+                "ph": "X",
+            }
+        ]
+
+
+@dataclass
+class BeginEvent(TracingEvent):
+    name: str
+    cat: str
+    pid: Union[str, int]
+    tid: Union[str, int]
+
+    # start timestamp (in us)
+    ts: float
+    stack: Optional[List[int]] = None
+
+    args: Optional[dict] = None
+
+    def to_objects(self) -> List[dict]:
+        return [
+            {
+                "name": self.name,
+                "cat": self.cat,
+                "pid": self.pid,
+                "tid": self.tid,
+                "args": self.args or {},
+                "ts": self.ts,
+                "ph": "B",
+            }
+        ]
+
+
+@dataclass
+class EndEvent(TracingEvent):
+    name: str
+    cat: str
+    pid: Union[str, int]
+    tid: Union[str, int]
+
+    # end timestamp (in us)
+    ts: float
+    stack: Optional[List[int]] = None
+
+    args: Optional[dict] = None
+
+    def to_objects(self) -> List[dict]:
+        return [
+            {
+                "name": self.name,
+                "cat": self.cat,
+                "pid": self.pid,
+                "tid": self.tid,
+                "args": self.args or {},
+                "ts": self.ts,
+                "ph": "E",
+            }
+        ]
+
+
+flow_event_id_counter = 0
+
+
+@dataclass
+class FlowEvent(TracingEvent):
+    # {"ph": "f", "id": 246, "pid": "172.20.133.93", "tid": 13, "ts": 1669171992173028, \
+    #  "cat": "async_gpu", "name": "cudaLaunchKernel", "bp": "e"}
+    name: str
+    cat: str
+
+    # list of (pid, tid, ts)
+    flows: List[Tuple[Union[str, int], Union[str, int], float]]
+
+    def to_objects(self) -> List[dict]:
+        global flow_event_id_counter
+        flow_event_id_counter += 1
+        gen_id = flow_event_id_counter  # use stable predictable id
+        ret = []
+        # a flow step that starts later than the following one is meaningless
+        # and will not be rendered, so nudge its timestamp forward
+        for i in range(1, len(self.flows)):
+            _, _, ts0 = self.flows[i - 1]
+            pid, tid, ts1 = self.flows[i]
+            if ts1 <= ts0:
+                self.flows[i] = (pid, tid, ts0 + 1)
+        for f in self.flows:
+            pid, tid, ts = f
+            ret.append(
+                {
+                    "name": self.name,
+                    "cat": self.cat,
+                    "pid": pid,
+                    "tid": tid,
+                    "ts": ts,
+                    "ph": "t",
+                    "bp": "e",
+                    "id": gen_id,
+                }
+            )
+        ret[0]["ph"] = "s"
+        ret[-1]["ph"] = "f"
+        ret[-1]["ts"] += 1
+        return ret
+
+
+@dataclass
+class CounterEvent(TracingEvent):
+    name: str
+    pid: Union[str, int]
+
+    # timestamp (in us)
+    ts: float
+
+    # the counted data series
+    data: Dict[str, Union[int, float]]
+
+    def to_objects(self) -> List[dict]:
+        return [
+            {
+                "name": self.name,
+                "pid": self.pid,
+                "args": self.data,
+                "ts": self.ts,
+                "ph": "C",
+            }
+        ]
+
+
+class CombinedEvents(TracingEvent):
+    """
+    Merges several tracing events and represents them as one event;
+    each underlying object is expanded in order at the end.
+    """
+
+    def __init__(self, events: List[TracingEvent]):
+        self.events = events
+
+    def to_objects(self) -> List[dict]:
+        obj = []
+        for e in self.events:
+            obj.extend(e.to_objects())
+        return obj
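+
+
+# For reference, a CompleteEvent serializes to one Chrome-trace object of the
+# form below (values are illustrative, not taken from a real run):
+#     {"name": "forward-compute", "cat": "forward-compute", "pid": 0,
+#      "tid": 10, "args": {}, "ts": 1669171992173028.0, "dur": 1500.0, "ph": "X"}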
"process_sort_index", + "pid": self.pid, + "ph": "M", + "args": { + "sort_index": self.sort_index, + }, + } + ] + if self.process_labels is not None: + ret.append( + { + "name": "process_labels", + "pid": self.pid, + "ph": "M", + "args": { + "labels": ",".join(self.process_labels), + }, + } + ) + if self.process_name is not None: + ret.append( + { + "name": "process_name", + "pid": self.pid, + "ph": "M", + "args": { + "name": self.process_name, + }, + } + ) + return ret + + +@dataclass +class ThreadMetadataEvent(TracingEvent): + pid: Union[str, int] + tid: Union[str, int] + sort_index: int + thread_name: Optional[str] = None + + def to_objects(self) -> List[dict]: + ret = [ + { + "name": "thread_sort_index", + "pid": self.pid, + "tid": self.tid, + "ph": "M", + "args": { + "sort_index": self.sort_index, + }, + } + ] + if self.thread_name is not None: + ret.append( + { + "name": "thread_name", + "pid": self.pid, + "tid": self.tid, + "ph": "M", + "args": { + "name": self.thread_name, + }, + } + ) + return ret + + +class DummyEvent(TracingEvent): + def to_objects(self) -> List[dict]: + return [ + { + "name": "dummy", + "cat": "dummy", + "pid": random.randint(1, 100), + "tid": random.randint(1, 100), + "args": { + "content": "*" * random.randint(100, 1000), + }, + "ts": random.randint(1, 9999), + "dur": random.randint(1, 100), + "ph": "i", + } + ] diff --git a/vescale/ndtimeline/handlers/do_nothing_handler.py b/vescale/ndtimeline/handlers/do_nothing_handler.py new file mode 100644 index 0000000..ad4b9eb --- /dev/null +++ b/vescale/ndtimeline/handlers/do_nothing_handler.py @@ -0,0 +1,36 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from typing import Dict, List, Any + +from .handler_base import NDHandler +from ..world_info import WorldInfo + + +class DoNothingNDHandler(NDHandler): + def call_impl( + self, + metric_name: str, + elapsed: float, + recent_elapsed_raw_parts: List[float], + recent_since_start_raw_parts: List[float], + tags: List[Dict[str, Any]], + step_range: range, + world_info: WorldInfo, + extra: Dict[str, Any], + ) -> Any: + pass diff --git a/vescale/ndtimeline/handlers/handler_base.py b/vescale/ndtimeline/handlers/handler_base.py new file mode 100644 index 0000000..055240e --- /dev/null +++ b/vescale/ndtimeline/handlers/handler_base.py @@ -0,0 +1,79 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from typing import List, Any, Dict
+from abc import ABC, abstractmethod
+from ..variables import NDTIMELINE_FLUSH_SEPCIAL
+from ..world_info import WorldInfo
+
+
+class NDHandler(ABC):
+    def __init__(self, designated_key="", ignore_metrics=None) -> None:
+        super().__init__()
+        self._dispatch_key = self.__class__.__name__
+        self._ignore_metrics = ignore_metrics if ignore_metrics is not None else [NDTIMELINE_FLUSH_SEPCIAL]
+        if designated_key != "":
+            self._dispatch_key = designated_key
+
+    @property
+    def dispatch_key(self):
+        return self._dispatch_key
+
+    @property
+    def ignore_metrics(self):
+        return self._ignore_metrics
+
+    def __repr__(self) -> str:
+        return f"NDHandler instance with dispatch key: {self._dispatch_key}"
+
+    def __call__(
+        self,
+        metric_name: str,
+        elapsed: float,
+        recent_elapsed_raw_parts: List[float],
+        recent_since_start_raw_parts: List[float],
+        tags: List[Dict[str, Any]],
+        step_range: range,
+        world_info: WorldInfo,
+        extra: Dict[str, Any],
+    ) -> Any:
+        if metric_name in self.ignore_metrics:
+            return
+        return self.call_impl(
+            metric_name,
+            elapsed,
+            recent_elapsed_raw_parts,
+            recent_since_start_raw_parts,
+            tags,
+            step_range,
+            world_info,
+            extra,
+        )
+
+    @abstractmethod
+    def call_impl(
+        self,
+        metric_name: str,
+        elapsed: float,
+        recent_elapsed_raw_parts: List[float],
+        recent_since_start_raw_parts: List[float],
+        tags: List[Dict[str, Any]],
+        step_range: range,
+        world_info: WorldInfo,
+        extra: Dict[str, Any],
+    ) -> Any:
+        pass
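+
+
+# Illustrative sketch (not shipped here): a custom handler only needs to
+# implement `call_impl`; metric filtering via `ignore_metrics` and dispatch
+# via `dispatch_key` are inherited from NDHandler.
+#
+#     class PrintNDHandler(NDHandler):
+#         def call_impl(self, metric_name, elapsed, recent_elapsed_raw_parts,
+#                       recent_since_start_raw_parts, tags, step_range,
+#                       world_info, extra):
+#             print(f"{metric_name}: {elapsed:.6f}s over steps {list(step_range)}")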
diff --git a/vescale/ndtimeline/handlers/local_raw_handler.py b/vescale/ndtimeline/handlers/local_raw_handler.py
new file mode 100644
index 0000000..6db4cc7
--- /dev/null
+++ b/vescale/ndtimeline/handlers/local_raw_handler.py
@@ -0,0 +1,67 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import logging
+import os
+from typing import List, Dict, Any
+from logging import Formatter
+from logging.handlers import RotatingFileHandler
+
+from .handler_base import NDHandler
+from ..world_info import WorldInfo
+from ..variables import LOCAL_LOGGING_PATH
+
+
+CHUNK_SZ = 1024 * 1024 * 128  # 128 MiB
+BACKUP_CNT = 8
+
+
+class LocalRawNDHandler(NDHandler):
+    def __init__(
+        self, run_id: int, log_path: str = LOCAL_LOGGING_PATH, chunk_sz: int = CHUNK_SZ, backup_cnt: int = BACKUP_CNT
+    ) -> None:
+        """Log files rotate once they exceed `chunk_sz` bytes; at most `backup_cnt` rotated files are kept, older ones are dropped."""
+        super().__init__()
+        if not os.path.exists(log_path):
+            os.makedirs(log_path, exist_ok=True)
+        file_name = f"timeline_run{run_id}_raw.log"
+        formatter = Formatter("%(asctime)s - %(message)s")
+        handler = RotatingFileHandler(
+            filename=os.path.join(log_path, file_name), maxBytes=chunk_sz, backupCount=backup_cnt
+        )
+        handler.setFormatter(formatter)
+        self.logger = logging.getLogger("LocalRawNDHandler")
+        self.logger.propagate = False
+        self.logger.addHandler(handler)
+        self.logger.setLevel(logging.DEBUG)
+
+    def call_impl(
+        self,
+        metric_name: str,
+        elapsed: float,
+        recent_elapsed_raw_parts: List[float],
+        recent_since_start_raw_parts: List[float],
+        tags: List[Dict[str, Any]],
+        step_range: range,
+        world_info: WorldInfo,
+        extra: Dict[str, Any],
+    ) -> Any:
+        msg = (
+            f"metric_name: {metric_name}, elapsed: {elapsed}, recent_elapsed_raw_parts: {recent_elapsed_raw_parts}, recent_since_start_raw_parts: {recent_since_start_raw_parts},"
+            f" tags: {tags}, step_range: {step_range}, world_info: {world_info}"
+        )
+        self.logger.info(msg)
diff --git a/vescale/ndtimeline/handlers/local_timeline_handler.py b/vescale/ndtimeline/handlers/local_timeline_handler.py
new file mode 100644
index 0000000..907c5ef
--- /dev/null
+++ b/vescale/ndtimeline/handlers/local_timeline_handler.py
@@ -0,0 +1,201 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import json
+from typing import List, Dict, Any, Set, Deque, Optional
+from collections import deque
+
+import torch
+
+from .chrome_trace_event import CompleteEvent, ThreadMetadataEvent, ProcessMetadataEvent
+from ..world_info import WorldInfo
+from ..variables import NDTIMELINE_FLUSH_SEPCIAL
+from .handler_base import NDHandler
+from .parser_handler import parse_record, DeviceTimerStreamRecord
+
+
+# thread_index_table
+
+
+def build_thread_index_table(tab, metrics, index, index_name):
+    for m in metrics:
+        tab[m] = (index, index_name)
+
+
+major_metrics = {
+    "forward-compute",
+    "backward-compute",
+    "embedding-grads-all-reduce",
+    "optimizer",
+    "optimizer-clip-main-grad",
+    "optimizer-inner-step",
+    "optimizer-copy-to-main-grad",
+    "optimizer-copy-main-to-model-params",
+}
+
+tp_stream_metrics = {
+    "tp-allreduce",
+    "tp-allgather",
+    "tp-reducescatter",
+    "layernorm-grads-all-reduce",
+}
+
+dp_stream_metrics = {
+    "grads-reduce-scatter",
+    "params-all-gather",
+    "separate-grads-all-reduce",
+    "grads-reduce-scatter-nonoverlapping",
+    "params-all-gather-nonoverlapping",
+}
+
+pp_batch_stream_metrics = {
+    "backward-send-backward-recv",
+    "backward-send-forward-recv",
+    "forward-send-backward-recv",
+    "forward-send-forward-recv",
+    "forward-backward-send-forward-backward-recv",
+    "cross-mesh-recv",
+    "cross-mesh-send",
+}
+
+pp_forward_stream_metrics = {
+    "forward-recv",
+    "backward-send",
+}
+
+pp_backward_stream_metrics = {
+    "forward-send",
+    "backward-recv",
+}
+
+
+thread_sort_index = {}
+build_thread_index_table(thread_sort_index, major_metrics, 0, "main")
+build_thread_index_table(thread_sort_index, pp_forward_stream_metrics, 1, "pp ->")
+build_thread_index_table(thread_sort_index, pp_backward_stream_metrics, 2, "pp <-")
+build_thread_index_table(thread_sort_index, pp_batch_stream_metrics, 3, "pp send/recv")
+build_thread_index_table(thread_sort_index, tp_stream_metrics, 4, "tp collective")
+build_thread_index_table(thread_sort_index, dp_stream_metrics, 5, "dp collective")
+sort_index_other = 6
+index_name_other = "other"
+
+
+events = []
+tid_names = {}  # tid -> (pid, name)
+
+MAX_UINT64 = 18446744073709551615
+NEGTIVE_ONE = -1
+
+
+class LocalTimelineNDHandler(NDHandler):
+    def __init__(self, n_rank_per_host: Optional[int] = None):
+        super().__init__(ignore_metrics=[])
+        if n_rank_per_host is None:
+            n_rank_per_host = torch.cuda.device_count()
+        self.n_rank_per_host = n_rank_per_host
+        self.rank2buffer: List[List[DeviceTimerStreamRecord]] = [[] for _ in range(n_rank_per_host)]
+        # rank -> deque(set(steps), set(steps), empty set)
+        self.rank2steps: List[Deque[Set[int]]] = [deque(set() for _ in range(1)) for _ in range(n_rank_per_host)]
+
+    def dump_records(self):
+        output_ranks = set()
+        events = []
+        min_step, max_step = MAX_UINT64, NEGTIVE_ONE
+        buffer = [record for rank in range(self.n_rank_per_host) for record in self.rank2buffer[rank]]
+        for record in buffer:
+            metric, step, rank, dp_rank = record.metric, record.step, record.rank, record.dp_rank
+            if step < 0:
+                continue
+            min_step = min(min_step, step)
+            max_step = max(max_step, step)
+            output_ranks.add((dp_rank, rank))
+            sort_index, index_name = thread_sort_index.get(metric, (sort_index_other, index_name_other))
+            tid = rank * 10 + sort_index  # multiply by 10 to reserve the ones digit for the thread_sort_index encoding
+            tid_names[tid] = (dp_rank, f"rank[{rank}] {index_name}")
+            for ts, dur in zip(record.start_ts, record.duration):
+                args = {
+                    "rank": rank,
+                    "step": step,
"tp": record.tp_rank, + "pp": record.pp_rank, + } + ev = CompleteEvent(name=metric, cat=metric, pid=dp_rank, tid=tid, ts=ts * 1e6, dur=dur * 1e6, args=args) + events.append(ev) + for tid, (dp_rank, name) in tid_names.items(): + ev = ThreadMetadataEvent( + pid=dp_rank, + tid=tid, + sort_index=tid, + thread_name=name, + ) + events.append(ev) + for dp_rank in {dp_rank for dp_rank, _ in output_ranks}: + ev = ProcessMetadataEvent(pid=dp_rank, sort_index=dp_rank, process_name=f"dp rank[{dp_rank}]") + events.append(ev) + spans = [] + for ev in events: + spans.extend(ev.to_objects()) + with open(f"trace_step{min_step}_{max_step}", "w") as f: + json.dump(spans, f) + + def call_impl( + self, + metric_name: str, + elapsed: float, + recent_elapsed_raw_parts: List[float], + recent_since_start_raw_parts: List[float], + tags: List[Dict[str, Any]], + step_range: range, + world_info: WorldInfo, + extra: Dict[str, Any], + ) -> Any: + local_rank = world_info["local_rank"] + if metric_name == NDTIMELINE_FLUSH_SEPCIAL: + self.rank2steps[local_rank].append(set()) + if all(len(self.rank2steps[i]) >= 2 for i in range(self.n_rank_per_host)): + # split + new_rank2buffer: List[List[DeviceTimerStreamRecord]] = [[] for _ in range(self.n_rank_per_host)] + for rank in range(self.n_rank_per_host): + # use record.copy to avoid gc failure and memory leaking + new_rank2buffer[rank] = [ + record.copy() + for record in self.rank2buffer[rank] + if record.step not in self.rank2steps[rank][0] + ] + self.rank2buffer[rank] = [ + record for record in self.rank2buffer[rank] if record.step in self.rank2steps[rank][0] + ] + self.dump_records() + # update + self.rank2buffer = new_rank2buffer + for rank in range(self.n_rank_per_host): + self.rank2steps[rank].popleft() + else: + # assume local_rank is in [0...n_rank_per_device-1] + records = parse_record( + metric_name, + elapsed, + recent_elapsed_raw_parts, + recent_since_start_raw_parts, + tags, + step_range, + world_info, + extra, + ) + self.rank2buffer[local_rank].extend(records) + for record in records: + self.rank2steps[local_rank][-1].add(record.step) diff --git a/vescale/ndtimeline/handlers/logging_handler.py b/vescale/ndtimeline/handlers/logging_handler.py new file mode 100644 index 0000000..ca21e4f --- /dev/null +++ b/vescale/ndtimeline/handlers/logging_handler.py @@ -0,0 +1,47 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +from typing import List, Dict, Any + +from .handler_base import NDHandler +from ..world_info import WorldInfo +from ..logger import NDTimelineLogger + + +class LoggingNDHandler(NDHandler): + def __init__(self) -> None: + super().__init__() + + def call_impl( + self, + metric_name: str, + elapsed: float, + recent_elapsed_raw_parts: List[float], + recent_since_start_raw_parts: List[float], + tags: List[Dict[str, Any]], + step_range: range, + world_info: WorldInfo, + extra: Dict[str, Any], + ) -> Any: + NDTimelineLogger().debug( + f"#recent_elapsed_raw_parts: {len(recent_elapsed_raw_parts)}, #recent_since_start_raw_parts {len(recent_since_start_raw_parts)}" + ) + if len(step_range) < 1: + raise ValueError(f"step_range length is {len(step_range)}") + NDTimelineLogger().info( + f"[rank{world_info.topo_info.rank}, step{step_range[0]}-{step_range[-1]}]: {len(recent_since_start_raw_parts)} times {metric_name} total cost: {elapsed*1000:.2f}ms" + ) diff --git a/vescale/ndtimeline/handlers/parser_handler.py b/vescale/ndtimeline/handlers/parser_handler.py new file mode 100644 index 0000000..d56cccf --- /dev/null +++ b/vescale/ndtimeline/handlers/parser_handler.py @@ -0,0 +1,206 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+import time
+import itertools
+from dataclasses import dataclass
+from typing import List, Dict, Any
+
+from ..logger import NDTimelineLogger
+from .handler_base import NDHandler
+from ..exceptions import NDHandlerError
+from ..world_info import WorldInfo
+from ..variables import NDTIMELINE_INNER_GLOBAL_STEP_KEY
+
+
+@dataclass
+class DeviceTimerStreamRecord:
+    ts: int  # record time for partition purposes
+    rank: int
+    metric: str
+    iteration: int  # legacy field, no meaning
+    step: int
+    avg_dur: float  # time elapsed, legacy name
+    start_ts: List[float]
+    duration: List[float]
+    model_chunk: int  # vpp model chunk id, starts from 0
+    pp_rank: int  # pp rank (kept under a legacy name)
+    dp_rank: int  # the rank within the enclosing dp group
+    tp_rank: int  # the rank within the enclosing tp group
+    ip: str
+    role_id: int  # multi-role in RL
+    trial_id: int  # trial id
+    run_id: int  # run id
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "ts": self.ts,
+            "rank": self.rank,
+            "metric": self.metric,
+            "iteration": self.iteration,
+            "step": self.step,
+            "value": self.avg_dur,
+            "start_ts": self.start_ts,
+            "duration": self.duration,
+            "model_chunk": self.model_chunk,
+            "stage": self.pp_rank,
+            "dp_rank": self.dp_rank,
+            "tp_rank": self.tp_rank,
+            "ip": self.ip,
+            "role": self.role_id,
+            "trial": str(self.trial_id),
+            "run_id": self.run_id,
+        }
+
+    def copy(self):
+        return DeviceTimerStreamRecord(
+            self.ts,
+            self.rank,
+            self.metric,
+            self.iteration,
+            self.step,
+            self.avg_dur,
+            self.start_ts,
+            self.duration,
+            self.model_chunk,
+            self.pp_rank,
+            self.dp_rank,
+            self.tp_rank,
+            self.ip,
+            self.role_id,
+            self.trial_id,
+            self.run_id,
+        )
+
+
+def parse_record(
+    metric_name: str,
+    elapsed: float,
+    recent_elapsed_raw_parts: List[float],
+    recent_since_start_raw_parts: List[float],
+    tags: List[Dict[str, Any]],
+    step_range: range,
+    world_info: WorldInfo,
+    extra: Dict[str, Any],
+) -> List[DeviceTimerStreamRecord]:
+    if len(recent_elapsed_raw_parts) != len(recent_since_start_raw_parts):
+        raise NDHandlerError(
+            f"recent_elapsed_raw_parts {len(recent_elapsed_raw_parts)} not "
+            f"equal to recent_since_start_raw_parts {len(recent_since_start_raw_parts)}"
+        )
+    if len(recent_elapsed_raw_parts) != len(tags):
+        raise NDHandlerError(f"recent_elapsed_raw_parts {len(recent_elapsed_raw_parts)} not equal to tags {len(tags)}")
+
+    if len(recent_elapsed_raw_parts) == 0:
+        return []
+
+    specified_steps = [tag[NDTIMELINE_INNER_GLOBAL_STEP_KEY] for tag in tags if NDTIMELINE_INNER_GLOBAL_STEP_KEY in tag]
+
+    now = int(time.time())
+    records = []
+    if len(specified_steps) != 0:
+        # a metric with `INNER_GLOBAL_STEP_KEY` does not respect `step_range`,
+        # but it must always be tagged with `INNER_GLOBAL_STEP_KEY` and be monotonically increasing
+        if len(specified_steps) != len(tags):
+            raise NDHandlerError("timer with INNER_GLOBAL_STEP_KEY's step is not always set")
+
+        # to understand the following code,
+        # you can `print(list(itertools.groupby([21,22,23,23,23,46,46,49,50])))`
+        i = 0
+        # NDTimelineLogger().debug("{}: {}".format(metric_name, len(tags)))
+        for step, group_v in itertools.groupby(specified_steps):
+            op_counts = sum(1 for _ in group_v)  # memory-efficient version of `len(list(group_v))`
+            avg_dur = sum(recent_elapsed_raw_parts[i : i + op_counts]) / op_counts if op_counts != 0 else 0.0
+            record = DeviceTimerStreamRecord(
+                ts=now,
+                rank=world_info.topo_info.rank,
+                metric=metric_name,
+                iteration=0,
+                step=step,
+                avg_dur=avg_dur,
+                start_ts=recent_since_start_raw_parts[i : i + op_counts],
+                duration=recent_elapsed_raw_parts[i : i + op_counts],
+                model_chunk=0,
+                pp_rank=world_info.topo_info.pp_rank,
+                dp_rank=world_info.topo_info.dp_rank,
+                tp_rank=world_info.topo_info.tp_rank,
+                ip=world_info.topo_info.ip,
+                role_id=world_info["role_id"],
+                trial_id=world_info["trial_id"],
+                run_id=world_info["run_id"],
+            )
+            records.append(record)
+            i += op_counts
+    else:
+        if len(step_range) == 0:
+            raise NDHandlerError(f"step_range {step_range} length is zero")
+        if len(recent_elapsed_raw_parts) % len(step_range) != 0:
+            fmt_str = (
+                "len(recent_elapsed_raw_parts) {} of {} "
+                + "is not a multiple of len(step_range) {}; "
+                + "if you can't ensure op counts in every step are equal, "
+                + "please explicitly use `step_getter`"
+            )
+            raise NDHandlerError(fmt_str.format(metric_name, len(recent_elapsed_raw_parts), len(step_range)))
+        NDTimelineLogger().debug(f"{metric_name}: {len(recent_elapsed_raw_parts)} in {step_range}")
+        num_step_ops = len(recent_elapsed_raw_parts) // len(step_range)
+        for i, step in enumerate(step_range):
+            avg_dur = sum(recent_elapsed_raw_parts[i * num_step_ops : (i + 1) * num_step_ops]) / num_step_ops
+            record = DeviceTimerStreamRecord(
+                ts=now,
+                rank=world_info.topo_info.rank,
+                metric=metric_name,
+                iteration=0,
+                step=step,
+                avg_dur=avg_dur,
+                start_ts=recent_since_start_raw_parts[i * num_step_ops : (i + 1) * num_step_ops],
+                duration=recent_elapsed_raw_parts[i * num_step_ops : (i + 1) * num_step_ops],
+                model_chunk=0,
+                pp_rank=world_info.topo_info.pp_rank,
+                dp_rank=world_info.topo_info.dp_rank,
+                tp_rank=world_info.topo_info.tp_rank,
+                ip=world_info.topo_info.ip,
+                role_id=world_info["role_id"],
+                trial_id=world_info["trial_id"],
+                run_id=world_info["run_id"],
+            )
+            records.append(record)
+    return records
+
+
+class ParserNDHandler(NDHandler):
+    def call_impl(
+        self,
+        metric_name: str,
+        elapsed: float,
+        recent_elapsed_raw_parts: List[float],
+        recent_since_start_raw_parts: List[float],
+        tags: List[Dict[str, Any]],
+        step_range: range,
+        world_info: WorldInfo,
+        extra: Dict[str, Any],
+    ) -> Any:
+        return parse_record(
+            metric_name,
+            elapsed,
+            recent_elapsed_raw_parts,
+            recent_since_start_raw_parts,
+            tags,
+            step_range,
+            world_info,
+            extra,
+        )
diff --git a/vescale/ndtimeline/handlers/sock_handler.py b/vescale/ndtimeline/handlers/sock_handler.py
new file mode 100644
index 0000000..6d6cc0a
--- /dev/null
+++ b/vescale/ndtimeline/handlers/sock_handler.py
@@ -0,0 +1,107 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +################################################################################ + +import time +import traceback +import socket +from typing import List, Dict, Any + +from ..logger import NDTimelineLogger +from ..binary_protocol import serialize_to_package +from .handler_base import NDHandler +from ..world_info import WorldInfo +from ..variables import SOCK_PATH, SOCK_TIMEOUT_CLIENT + + +class SockNDHandler(NDHandler): + def __init__(self, timeout: float = SOCK_TIMEOUT_CLIENT, sock_path: str = SOCK_PATH): + super().__init__(ignore_metrics=[]) + self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + self.sock.settimeout(timeout) + self.sock_path = sock_path + self.timeout = timeout + self.initialized = False + self.server_exited = False + self.try_to_connect() + + def try_to_connect(self, must=False): + if self.initialized: + return + if must: + retry = 50 + else: + retry = 1 + backoff = 0.8 # seconds + for _ in range(retry + 1): + err_msg = "" + try: + self.sock.connect(self.sock_path) + self.initialized = True + break + except OSError as e: + if e.errno == 106 and e.strerror == "Transport endpoint is already connected": + # might be called in multiple threads + # but for one process, only one connection is required + self.initialized = True + break + else: + err_msg = traceback.format_exc() + time.sleep(backoff) + except Exception: + err_msg = traceback.format_exc() + time.sleep(backoff) + + if must and not self.initialized: + NDTimelineLogger().error(f"initialize sock handler failed: {err_msg}") + + def call_impl( + self, + metric_name: str, + elapsed: float, + recent_elapsed_raw_parts: List[float], + recent_since_start_raw_parts: List[float], + tags: List[Dict[str, Any]], + step_range: range, + world_info: WorldInfo, + extra: Dict[str, Any], + ) -> Any: + self.try_to_connect(True) + if self.server_exited: + return + try: + st = time.perf_counter() + pkg = serialize_to_package( + { + "metric_name": metric_name, + "elapsed": elapsed, + "recent_elapsed_raw_parts": recent_elapsed_raw_parts, + "recent_since_start_raw_parts": recent_since_start_raw_parts, + "tags": tags, + "step_range": step_range, + "world_info": world_info, + "extra": extra, + } + ) + self.sock.sendall(pkg) + NDTimelineLogger().debug(f"serialize and send data: {(time.perf_counter() - st) * 1000:3.3f}ms") + except BrokenPipeError as e: + NDTimelineLogger().error(f"{e}, server exit") + self.server_exited = True + except socket.timeout: + NDTimelineLogger().warning(f"socket timeout {traceback.format_exc()}") + except Exception: + NDTimelineLogger().error(traceback.format_exc()) diff --git a/vescale/ndtimeline/is_internal.py b/vescale/ndtimeline/is_internal.py new file mode 100644 index 0000000..0f140e3 --- /dev/null +++ b/vescale/ndtimeline/is_internal.py @@ -0,0 +1,23 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+try:
+    from ._internal import is_internal_vescale
+except ImportError:
+
+    def is_internal_vescale():
+        return False
diff --git a/vescale/ndtimeline/logger.py b/vescale/ndtimeline/logger.py
new file mode 100644
index 0000000..929c2a1
--- /dev/null
+++ b/vescale/ndtimeline/logger.py
@@ -0,0 +1,42 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+import logging
+import sys
+import os
+
+
+class NDTimelineLogger:
+    def __new__(cls):
+        # singleton: `NDTimelineLogger()` always returns the shared `logging.Logger`
+        if not hasattr(cls, "instance"):
+            level = logging.getLevelName(os.getenv("VESCALE_NDTIMELINE_LOG_LEVEL", "INFO"))
+            if isinstance(level, str):
+                # if VESCALE_NDTIMELINE_LOG_LEVEL has an illegal value,
+                # logging.getLevelName returns a str `Level xxx`
+                level = logging.WARNING
+            formatter = logging.Formatter(
+                "[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d][pid:%(process)d] - %(message)s",
+                datefmt="%Y-%m-%d %H:%M:%S",
+            )
+            handler = logging.StreamHandler(stream=sys.stderr)
+            handler.setFormatter(formatter)
+            cls.instance = logging.getLogger("ndtimeline")
+            cls.instance.addHandler(handler)
+            cls.instance.setLevel(level)
+            cls.instance.propagate = False
+        return cls.instance
diff --git a/vescale/ndtimeline/pool.py b/vescale/ndtimeline/pool.py
new file mode 100644
index 0000000..4cc5c34
--- /dev/null
+++ b/vescale/ndtimeline/pool.py
@@ -0,0 +1,78 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +################################################################################ + +import threading +from collections import deque +from typing import Dict, Any, Optional + +import torch +from torch.cuda import Event + +from .variables import DEFAULT_CUDA_EVENT_POOL_SIZE + + +class CudaEventPool: + def __init__( + self, device: Optional[int] = None, init_sz: int = DEFAULT_CUDA_EVENT_POOL_SIZE, blocking: bool = False + ) -> None: + self._pool = deque() + self._device = device + self._event_attr = {"enable_timing": True, "blocking": blocking, "interprocess": False} + + self._mtx = threading.Lock() + + for _ in range(init_sz): + event = Event(**self._event_attr) + event.tag = {} + self._pool.append(event) + event.record() # warmup + + def get(self, tag: Dict[str, Any]): + device = torch.cuda.current_device() + if self._device is not None: + device = self._device + with torch.cuda.device(device): + try: + with self._mtx: + event = self._pool.popleft() + except IndexError: + event = Event(**self._event_attr) + event.tag = tag.copy() + return event + + def release(self, event: Event): + with self._mtx: + self._pool.append(event) + + +class DefaultEventPool: + initialized = False + + @classmethod + def init(cls, device: Optional[int] = None): + assert not cls.initialized + cls._default_cuda_event_pool = CudaEventPool(device=device, blocking=True) + cls.initialized = True + + @classmethod + def get(cls, tag: Optional[Dict[str, Any]] = None): + tag = tag if tag is not None else {} + return cls._default_cuda_event_pool.get(tag) + + @classmethod + def release(cls, event: Event): + cls._default_cuda_event_pool.release(event) diff --git a/vescale/ndtimeline/predefined.py b/vescale/ndtimeline/predefined.py new file mode 100644 index 0000000..1cccffb --- /dev/null +++ b/vescale/ndtimeline/predefined.py @@ -0,0 +1,30 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +RECV_FORWARD = "forward-recv" +RECV_BACKWARD = "backward-recv" +SEND_FORWARD = "forward-send" +SEND_BACKWARD = "backward-send" +SEND_FORWARD_RECV_BACKWARD = "forward-send-backward-recv" +SEND_BACKWARD_RECV_FORWARD = "backward-send-forward-recv" +CROSS_MESH_RECV = "cross-mesh-recv" +CROSS_MESH_SEND = "cross-mesh-send" +FORWARD_COMPUTE = "forward-compute" +BACKWARD_COMPUTE = "backward-compute" +UNSHARD_AG = "unshard-all-gather" +GRAD_RS = "grad-reduce-scatter" +GRAD_AR = "grad-all-reduce" diff --git a/vescale/ndtimeline/sock_streamer.py b/vescale/ndtimeline/sock_streamer.py new file mode 100644 index 0000000..34b1983 --- /dev/null +++ b/vescale/ndtimeline/sock_streamer.py @@ -0,0 +1,132 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import socket +import socketserver +import os +import traceback +import queue +import threading +from typing import Callable, List, Optional + +import torch.multiprocessing as mp +from torch.multiprocessing import ProcessContext + +from .logger import NDTimelineLogger +from .binary_protocol import recv_and_validate, loads_fn +from .exceptions import ProtocolValidationError, NDHandlerError +from .variables import SOCK_PATH, SOCK_PARENT_DIR + +q = None + + +def internal_queue_consume(handlers: Optional[List[Callable]] = None): + if handlers is None: + handlers = [] + global q + while True: + try: + args = q.get(block=True) + for handler in handlers: + handler( + args["metric_name"], + args["elapsed"], + args["recent_elapsed_raw_parts"], + args["recent_since_start_raw_parts"], + args["tags"], + args["step_range"], + args["world_info"], + args["extra"], + ) + except NDHandlerError as e: + NDTimelineLogger().error(e) + NDTimelineLogger().warning(traceback.format_exc()) + continue + except queue.Empty: + continue + except Exception as e: + NDTimelineLogger().error(e) + NDTimelineLogger().error(traceback.format_exc()) + continue + + +class MsgHandler(socketserver.BaseRequestHandler): + def handle(self): + global q + # self.request is a socket, automatically closed after `handle` + assert q is not None + preload_data = bytearray() + while True: + try: + payload = recv_and_validate(self.request.recv, preload_data) + args = loads_fn(payload) + q.put(args) + except ProtocolValidationError: + pass + except ValueError as e: + NDTimelineLogger().error(e) + NDTimelineLogger().error(traceback.format_exc()) + except socket.timeout: + NDTimelineLogger().error("socket.timeout") + NDTimelineLogger().error(traceback.format_exc()) + except BrokenPipeError: + NDTimelineLogger().info("client exit") + break + except Exception: + NDTimelineLogger().error(traceback.format_exc()) + break + + +class NDtimelineStreamer: + p: ProcessContext + initialized: bool = False + + @classmethod + def init(cls, local_rank: int, handlers: Optional[List[Callable]] = None): + if local_rank != 0: + return + if cls.initialized: + NDTimelineLogger().warning("NDtimelineStreamer has already been initialized, skipped") + return + handlers = handlers if handlers is not None else [] + try: + if os.path.exists(SOCK_PATH): + os.remove(SOCK_PATH) + if not os.path.exists(SOCK_PARENT_DIR): + os.makedirs(SOCK_PARENT_DIR, exist_ok=True) + cls.p = mp.spawn( + fn=NDtimelineStreamer.run, args=(handlers,), nprocs=1, join=False, daemon=True, start_method="spawn" + ) + NDTimelineLogger().info("ndtimeline streamer started") + cls.initialized = True + except Exception: + NDTimelineLogger().error("NDtimelineStreamer init failed") + NDTimelineLogger().error(traceback.format_exc()) + + @staticmethod + def run(process_index, handlers: List[Callable]): + global q + # in order to save memory of main process, `q` is initialized here + q 
= queue.Queue(500000) + mq_thread = threading.Thread( + target=internal_queue_consume, args=(handlers,), daemon=True, name="internal_queue_consume" + ) + mq_thread.start() + + with socketserver.ThreadingUnixStreamServer(SOCK_PATH, MsgHandler) as server: + server.daemon_threads = True + server.serve_forever() diff --git a/vescale/ndtimeline/stream.py b/vescale/ndtimeline/stream.py new file mode 100644 index 0000000..a06b072 --- /dev/null +++ b/vescale/ndtimeline/stream.py @@ -0,0 +1,79 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import torch +from .logger import NDTimelineLogger + +NCCL_STREAMS = {} +DEVICE = None + + +def get_nccl_p2p_stream(name: str, nccl_pg: "torch.distributed.ProcessGroup", peer, is_batched): + global NCCL_STREAMS, DEVICE + if DEVICE is None: + DEVICE = torch.device("cuda", index=torch.cuda.current_device()) + if name in NCCL_STREAMS and NCCL_STREAMS[name] is not None: + return NCCL_STREAMS[name] + if hasattr(nccl_pg, "_get_backend"): + nccl_backend = nccl_pg._get_backend(DEVICE) + else: + # before torch 2.x torch._C._distributed_c10d.ProcessGroupNCCL is a subclass of + # torch.distributed.ProcessGroup + nccl_backend = nccl_pg + if hasattr(nccl_backend, "get_p2p_cuda_stream_id"): + stream_id = nccl_backend.get_p2p_cuda_stream_id(DEVICE.index, peer, is_batched) + NDTimelineLogger().debug(f"[{DEVICE.index}]{name} [{peer}] stream_id={stream_id}") + if stream_id < 0: + rank = nccl_pg.rank() + NDTimelineLogger().info(f"[{rank}]{name} is_batched={is_batched} p2p stream is not available, skipped") + return None + _CUDA_DEVICE = 1 + nccl_stream = torch.cuda.Stream(stream_id=stream_id, device_index=DEVICE.index, device_type=_CUDA_DEVICE) + rank = nccl_pg.rank() + msg = f"[{rank}]{name} nccl p2p stream id={stream_id} device={DEVICE} stream={nccl_stream}" + NDTimelineLogger().debug(msg) + NCCL_STREAMS[name] = nccl_stream + return nccl_stream + return None + + +def get_nccl_coll_stream(name: str, nccl_pg: "torch.distributed.ProcessGroup", nccl_tensor: torch.Tensor): + global NCCL_STREAMS + if name in NCCL_STREAMS and NCCL_STREAMS[name] is not None: + return NCCL_STREAMS[name] + device = nccl_tensor.device + if hasattr(nccl_pg, "_get_backend"): + nccl_backend = nccl_pg._get_backend(device) + else: + # before torch 2.x torch._C._distributed_c10d.ProcessGroupNCCL is a subclass of + # torch.distributed.ProcessGroup + nccl_backend = nccl_pg + if hasattr(nccl_backend, "get_coll_cuda_stream_id"): + NDTimelineLogger().info(nccl_backend) + stream_id = nccl_backend.get_coll_cuda_stream_id([nccl_tensor]) + if stream_id < 0: + rank = nccl_pg.rank() + NDTimelineLogger().info(f"[{rank}]{name} coll stream is not available, skipped") + return None + _CUDA_DEVICE = 1 + nccl_stream = torch.cuda.Stream(stream_id=stream_id, device_index=device.index, 
device_type=_CUDA_DEVICE) + rank = nccl_pg.rank() + msg = f"[{rank}]{name} nccl coll stream id={stream_id} device={device} stream={nccl_stream}" + NDTimelineLogger().debug(msg) + NCCL_STREAMS[name] = nccl_stream + return nccl_stream + return None diff --git a/vescale/ndtimeline/timer.py b/vescale/ndtimeline/timer.py new file mode 100644 index 0000000..f913dd9 --- /dev/null +++ b/vescale/ndtimeline/timer.py @@ -0,0 +1,756 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import dataclasses +import time +import traceback +import gc +import contextlib +from decimal import Decimal +from enum import Enum, unique +from dataclasses import dataclass +from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED +from typing import List, Dict, Any, Callable, Optional, Tuple, Union + +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal +from functools import wraps + +import torch + +from .pool import DefaultEventPool, Event +from .world_info import WorldInfo +from .stream import get_nccl_p2p_stream, get_nccl_coll_stream +from .logger import NDTimelineLogger +from .variables import ( + NDTIMELINE_INNER_GLOBAL_STEP_KEY, + NDTIMELINE_STREAM_KEY, + NDTIMELINE_FLUSH_SEPCIAL, +) + + +class GlobalReferenceTime: + local_rank: int = 0 + world_size: int = 0 + device: torch.device = None + # global ref events + ref_events: List[torch.cuda.Event] = [] + ref_pointer: int = 0 + clock_diff: float = 0.0 # ms + initial_min_clock: int = 0 # ns + last_calibrated_at: float = 0.0 # ms + gpu_clock_residual_coef: float = 1.0 + initialized: bool = False + + @classmethod + def init(cls, world_sz: int, device: Optional[Union[int, torch.device]] = None): + if isinstance(device, int): + cls.device = torch.device(f"cuda:{device}") + cls.local_rank = device + elif isinstance(device, torch.device): + cls.device = device + cls.local_rank = device.index + elif device is None: + cls.device = torch.device(f"cuda:{torch.cuda.current_device()}") + cls.local_rank = torch.cuda.current_device() + else: + raise RuntimeError(f"device must be int or torch.device or None, but got {type(device)}") + cls.world_size = world_sz + assert isinstance(cls.device, torch.device) + with torch.cuda.device(cls.device.index): + cls.ref_events = [ + torch.cuda.Event(enable_timing=True, blocking=False, interprocess=False) for _ in range(2) + ] + # warmup + for e in cls.ref_events: + e.record(stream=torch.cuda.default_stream()) + for e in cls.ref_events: + e.synchronize() + cls.calibrate() + if cls.local_rank == 0: + NDTimelineLogger().debug(f"cls.initial_min_clock: {cls.initial_min_clock}ns") + cls.initialized = True + + @classmethod + def sync_events(cls): + for i in range(len(cls.ref_events)): + cls.ref_events[i].synchronize() + + @classmethod + def calibrate(cls): 
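+        # A sketch of what follows: record a reference CUDA event paired with a CPU
+        # wall-clock timestamp; on the first call, all-gather the timestamps across
+        # ranks to anchor every rank to the global minimum clock; on later calls,
+        # estimate a residual coefficient that corrects GPU-vs-CPU clock drift.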
+        # round-robin
+        calibrate_st = time.perf_counter()
+        next_pointer = (cls.ref_pointer + 1) % len(cls.ref_events)
+        cls.ref_pointer = next_pointer
+        ref = cls.ref_events[next_pointer]
+        with torch.cuda.device(cls.device.index):
+            if not cls.initialized:
+                torch.distributed.barrier()
+                torch.cuda.synchronize()
+                # torch.cuda.default_stream().synchronize()
+            ref.record(stream=torch.cuda.default_stream())
+            ref.synchronize()
+            ts_ns = int(time.time_ns())
+            ts = ts_ns / 1e6
+
+        if not cls.initialized:
+            my_clock = torch.tensor([ts_ns], dtype=torch.long, device=cls.device)
+            world_clocks = [torch.zeros([1], dtype=torch.long, device=cls.device) for _ in range(cls.world_size)]
+            torch.distributed.all_gather(world_clocks, my_clock)
+            all_clocks = [r.cpu().tolist()[0] for r in world_clocks]
+            min_clock = min(all_clocks)
+            cls.initial_min_clock = min_clock
+
+        cls.clock_diff = (ts_ns - cls.initial_min_clock) / 1e6  # to unit ms
+
+        # cpu-gpu calibrate
+        cpu_time = ts - cls.last_calibrated_at  # ms
+        gpu_time = 0.0
+        cls.last_calibrated_at = ts  # ms
+        if cls.initialized and 2 * 1e3 < cpu_time < 200000 * 1e3:
+            gpu_time = abs(cls.ref_events[0].elapsed_time(cls.ref_events[1]))  # ms
+            gpu_cpu_diff = Decimal((gpu_time) - (cpu_time)) / Decimal(gpu_time)
+            cls.gpu_clock_residual_coef = float(1 - gpu_cpu_diff)
+        if cls.local_rank == 0:
+            NDTimelineLogger().info(
+                f"local rank 0, calibrate sync cpu moment: {ts_ns} ns, clock diff: {cls.clock_diff} ms, "
+                f"initial min: {cls.initial_min_clock} ns, "
+                f"gpu clock residual coef: {cls.gpu_clock_residual_coef}, "
+                f"calibrate cpu: {cpu_time}ms, calibrate gpu: {gpu_time}ms"
+            )
+        NDTimelineLogger().info(
+            f"rank {cls.local_rank} calibrate cost {1000 * (time.perf_counter() - calibrate_st):4.2f}ms"
+        )
+
+    @classmethod
+    def elapsed_time(cls, end_event):
+        # cuda event elapsed_time returns in unit ms
+        gpu_time = cls.ref_events[cls.ref_pointer].elapsed_time(end_event)
+        return gpu_time * cls.gpu_clock_residual_coef + cls.last_calibrated_at
+
+    @classmethod
+    def since_global_start_ts(cls, unix_ts):
+        # unix_ts in unit s
+        return unix_ts - cls.initial_min_clock / 1e9
+
+
+@unique
+class NDMetricLevel(Enum):
+    """
+    NDMetricLevel is used to define the level of a metric.
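+    Lower values denote higher priority: a timer records an event only when its
+    own level is not greater than the manager's configured `metric_level`.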
+ """ + + FRAMEWORK_INFO = 2 + USER_INFO = 3 + INFO = 4 + + FRAMEWORK_DEBUG = 12 + USER_DEBUG = 13 + DEBUG = 14 + + FRAMEWORK_TRACE = 102 + USER_TRACE = 103 + TRACE = 104 + + def __lt__(self, other) -> bool: + return self.value < other.value + + def __le__(self, other) -> bool: + return self.value <= other.value + + def __gt__(self, other) -> bool: + return self.value > other.value + + def __ge__(self, other) -> bool: + return self.value >= other.value + + def __eq__(self, other) -> bool: + return self.value == other.value + + def __neq__(self, other) -> bool: + return self.value != other.value + + +@dataclass(frozen=False) +class DeviceTimerMeta: + name: str = "" + is_cpu_op: bool = False + legal_tags: List[str] = dataclasses.field(default_factory=list) + step_getter: Optional[Callable] = None + enabled: bool = True + level: NDMetricLevel = dataclasses.field(default_factory=lambda: NDMetricLevel.FRAMEWORK_DEBUG) + device_id: int = -1 + dispatch_mode: Literal["selected", "all"] = "all" + dst_names: List[str] = dataclasses.field(default_factory=list) + specified_extra: Dict[str, Any] = dataclasses.field(default_factory=dict) + common_extra: Dict[str, Any] = dataclasses.field(default_factory=dict) + + def __post_init__(self): + if self.dispatch_mode not in ["selected", "all"]: + raise ValueError(f"invalid dispatch_mode {self.dispatch_mode}") + if not isinstance(self.level, NDMetricLevel): + raise ValueError(f"invalid type of level {type(self.level)}") + + def copy(self): + return DeviceTimerMeta( + self.name, + self.is_cpu_op, + self.legal_tags.copy(), + self.step_getter, + self.enabled, + self.level, + self.device_id, + self.dispatch_mode, + self.dst_names.copy(), + self.specified_extra.copy(), + self.common_extra.copy(), + ) + + +class DeviceTimer: + def __init__( + self, + name: str, + is_cpu_op: bool = False, + legal_tags: Optional[List[str]] = None, + step_getter: Optional[Callable] = None, + enabled: bool = True, + level: NDMetricLevel = NDMetricLevel.FRAMEWORK_DEBUG, + device_id: int = 0, + dispatch_mode: Literal["selected", "all"] = "all", + dst_names: Optional[List[str]] = None, + specified_extra: Optional[Dict[str, Any]] = None, + common_extra: Optional[Dict[str, Any]] = None, + ) -> None: + super().__init__() + + legal_tags = legal_tags if legal_tags is not None else [] + dst_names = dst_names if dst_names is not None else [] + specified_extra = specified_extra if specified_extra is not None else {} + common_extra = common_extra if common_extra is not None else {} + + if dispatch_mode not in ["all", "selected"]: + raise ValueError(f"invaid dispatch_mode {dispatch_mode} {type(dispatch_mode)}") + self.meta = DeviceTimerMeta( + name, + is_cpu_op, + legal_tags, + step_getter, + enabled, + level, + device_id, + dispatch_mode, + dst_names, + specified_extra, + common_extra, + ) + for field_name in self.meta.__dict__: + setattr(self, field_name, getattr(self.meta, field_name)) + if NDTIMELINE_INNER_GLOBAL_STEP_KEY not in self.legal_tags and step_getter is not None: + legal_tags.append(NDTIMELINE_INNER_GLOBAL_STEP_KEY) + if NDTIMELINE_STREAM_KEY not in self.legal_tags: + legal_tags.append(NDTIMELINE_STREAM_KEY) + self._started: bool = False + self._stream: torch.cuda.Stream = None + # list of [start_event, stop_event] + self._event_pairs: List[List[Event, Event]] = [] + self._pool = DefaultEventPool + # list of [start_ts, duration, tag] + self._extra_records: List[List[float, float, Dict[str, Any]]] = [] + + def __repr__(self) -> str: + return f"DeviceTimer with {self.meta.__repr__()}" + 
+ def is_enabled(self) -> bool: + return self.enabled + + def enable(self): + self.meta.enabled = True + self.enabled = True + + def disable(self): + self.meta.enabled = False + self.enabled = False + + def insert_record( + self, + start_ts: float, + duration: float, + tag: Optional[Dict[str, Any]] = None, + level: NDMetricLevel = NDMetricLevel.FRAMEWORK_DEBUG, + ): + if not self.enabled or self.meta.level > level: + return + tag = tag if tag is not None else {} + if self.step_getter is not None: + tag[NDTIMELINE_INNER_GLOBAL_STEP_KEY] = self.step_getter() + self._extra_records.append([start_ts, duration, tag]) + + def start( + self, + stream: torch.cuda.Stream = None, + tag: Optional[Dict[str, Any]] = None, + level: NDMetricLevel = NDMetricLevel.FRAMEWORK_DEBUG, + ) -> None: + """Start the timer""" + if not self.enabled or self.meta.level > level: + return + assert not self._started, "timer has already been started" + tag = tag if tag is not None else {} + if self.step_getter is not None: + tag[NDTIMELINE_INNER_GLOBAL_STEP_KEY] = self.step_getter() + if self.is_cpu_op: + self._extra_records.append([time.time(), None, tag]) + self._started = True + return + start_event = self._pool.get(tag=tag) + stream_args = {} + if stream is not None: + self._stream = stream + self._stream.wait_stream(torch.cuda.default_stream()) + stream_args = {"stream": self._stream} + start_event.record(**stream_args) + self._event_pairs.append([start_event, None]) + self._started = True + + def stop(self, tag: Optional[Dict[str, Any]] = None, level: NDMetricLevel = NDMetricLevel.FRAMEWORK_DEBUG) -> None: + """Stop the timer. May be called in another thread.""" + if not self.enabled or self.meta.level > level: + return + assert self._started, "timer is not started" + tag = tag if tag is not None else {} + if self.is_cpu_op: + now = time.time() + assert self._extra_records[-1][1] is None, "duration is already set" + self._extra_records[-1][1] = now - self._extra_records[-1][0] + self._extra_records[-1][2] = {**tag, **self._extra_records[-1][2]} + self._started = False + return + stop_event = self._pool.get(tag=tag) + stream_args = {} + if self._stream is not None: + stream_args = {"stream": self._stream} + stop_event.record(**stream_args) + assert self._event_pairs[-1][-1] is None, "stop_event is already set" + self._event_pairs[-1][-1] = stop_event + self._started = False + + def reset(self) -> None: + self._started = False + self._stream = None + self._event_pairs = [] + self._extra_records = [] + + def elapsed(self, reset=True) -> Tuple[float, List[float], List[float], List[Dict[str, Any]]]: + """Calculate the elapsed time.""" + if not self.enabled: + return 0.0, [], [], [] + recent_elapsed_raw_parts = [0.0] * len(self._event_pairs) + recent_since_start_raw_parts = [0.0] * len(self._event_pairs) + tags = [{}] * len(self._event_pairs) + elapsed = 0.0 + with torch.cuda.device(self.device_id): + for i, (start_event, stop_event) in enumerate(self._event_pairs): + stop_event.synchronize() + start_event.synchronize() + single_elapsed = start_event.elapsed_time(stop_event) / 1e3 + single_since = GlobalReferenceTime.elapsed_time(start_event) / 1e3 + elapsed += single_elapsed + recent_elapsed_raw_parts[i] = single_elapsed + recent_since_start_raw_parts[i] = single_since + tags[i] = {**start_event.tag, **stop_event.tag} + tags[i] = {k: tags[i][k] for k in tags[i] if k in self.legal_tags} + self._pool.release(start_event) + self._pool.release(stop_event) + + if len(self._extra_records) > 0: + try: + elapsed += sum([record[1] 
for record in self._extra_records]) + except TypeError as e: + NDTimelineLogger().error( + f"exception {e} detected in `elapsed` of {self.name}, possible unmatched start stop" + ) + return 0.0, [], [], [] + self._extra_records.sort(key=lambda x: x[0]) + if len(recent_since_start_raw_parts) == 0: + recent_since_start_raw_parts = [record[0] for record in self._extra_records] + recent_elapsed_raw_parts = [record[1] for record in self._extra_records] + tags = [record[2] for record in self._extra_records] + else: + i = 0 + for record in self._extra_records: + while i < len(recent_since_start_raw_parts) and recent_since_start_raw_parts[i] < record[0]: + i += 1 + # a.insert(len(a), x) is equivalent to a.append(x). + recent_since_start_raw_parts.insert(i, record[0]) + recent_elapsed_raw_parts.insert(i, record[1]) + tags.insert(i, record[2]) + i += 1 + if reset: + self.reset() + return elapsed, recent_elapsed_raw_parts, recent_since_start_raw_parts, tags + + +class NDTimerManager: + def __init__( + self, + world_info: WorldInfo, + handlers: Optional[List[Callable]] = None, + max_workers: int = 3, + device_id: Optional[int] = None, + init_cuda_dist: bool = True, + metric_level: NDMetricLevel = NDMetricLevel.TRACE, + is_nature_step: bool = True, + ) -> None: + self._name2timer = {} + self._name2active_tmp: Dict[str, bool] = {} + self._executor = ThreadPoolExecutor(max_workers=max_workers) + self._futures = [] + self._is_initailized = False + self.world_info = world_info + self.handlers = handlers if handlers is not None else [] + self._device_id = device_id + self.metric_level = metric_level + self.is_nature_step = is_nature_step + self._unregistered_timer_start = [] + self._unregistered_timer_stop = [] + self._unregistered_timer_records_insert = [] + self._cur_global_step = 0 + + if init_cuda_dist: + self.init_cuda_dist_associated(device_id=device_id) + + @property + def global_step(self): + return self._cur_global_step + + @global_step.setter + def global_step(self, step: int): + if not isinstance(step, int): + raise ValueError(f"step {step} is not int") + self._cur_global_step = step + + def init_cuda_dist_associated(self, device_id: Optional[int] = None): + self._device_id = device_id + if self._device_id is not None: + DefaultEventPool.init(device=self._device_id) + GlobalReferenceTime.init(device=self._device_id, world_sz=self.world_info["world_size"]) + else: + DefaultEventPool.init() + GlobalReferenceTime.init(world_sz=self.world_info["world_size"]) + + def register_timers(self, timer_metas: List[DeviceTimerMeta]) -> None: + for meta in timer_metas: + if meta.device_id == -1: + if not meta.is_cpu_op: + meta.device_id = torch.cuda.current_device() + else: + meta.device_id = 0 + if meta.step_getter is None and self.is_nature_step: + + def getter(): + return self._cur_global_step + + meta.step_getter = getter + assert not self._is_initailized, "DeviceTimerManager should only be initialized once" + NDTimerManager._register_timers(timer_metas, self._name2timer) + self._is_initailized = True + + @staticmethod + def _register_timers(timer_metas: List[DeviceTimerMeta], d: Dict[str, DeviceTimer]): + for meta in timer_metas: + d[meta.name] = DeviceTimer(**meta.__dict__) + + @staticmethod + def _flush_timers( + handlers: List[Callable], + name2timer: Dict[str, DeviceTimer], + step_range: range, + world_info: WorldInfo, + require_calibrate: bool = False, + ) -> None: + if require_calibrate: + GlobalReferenceTime.calibrate() + for name in name2timer: + timer = name2timer[name] + elapsed_result = 
timer.elapsed()
+            for handler in handlers:
+                if timer.dispatch_mode == "selected" and handler.dispatch_key not in timer.dst_names:
+                    continue
+                extra = timer.common_extra
+                if handler.dispatch_key in timer.specified_extra:
+                    specified_extra = timer.specified_extra[handler.dispatch_key]
+                    extra = {**extra, **specified_extra}
+                try:
+                    handler(name, *elapsed_result, step_range, world_info, extra)
+                except Exception as e:
+                    NDTimelineLogger().error(f"handler {handler} failed: {e}")
+                    NDTimelineLogger().error(traceback.format_exc())
+            timer.meta = None  # drop meta in case the DeviceTimer obj fails to be gc'ed due to the meta obj
+
+        for handler in handlers:
+            # `extra` here carries the last timer's extra; the payload only signals a flush
+            handler(NDTIMELINE_FLUSH_SEPCIAL, 0.0, [], [], [], range(0, 1), world_info, extra)
+
+    def start_timer(self, name: str, tag: Optional[Dict[str, Any]] = None) -> None:
+        assert isinstance(self, NDTimerManager) or issubclass(type(self), NDTimerManager)
+        tag = tag if tag is not None else {}
+        try:
+            if name not in self._unregistered_timer_start:
+                stream = None
+                if NDTIMELINE_STREAM_KEY in tag:
+                    stream = tag[NDTIMELINE_STREAM_KEY]
+                    del tag[NDTIMELINE_STREAM_KEY]
+                self._name2timer[name].start(stream=stream, tag=tag, level=self.metric_level)
+        except KeyError:
+            self._unregistered_timer_start.append(name)
+            NDTimelineLogger().warning(f"metric {name} is not registered when `start_timer`, skipped")
+        except Exception:
+            NDTimelineLogger().error(f"trigger exception when `start_timer` metric {name}")
+            NDTimelineLogger().error(traceback.format_exc())
+
+    def stop_timer(self, name, tag: Optional[Dict[str, Any]] = None) -> None:
+        assert isinstance(self, NDTimerManager) or issubclass(type(self), NDTimerManager)
+        tag = tag if tag is not None else {}
+        try:
+            if name not in self._unregistered_timer_stop:
+                if NDTIMELINE_STREAM_KEY in tag:
+                    del tag[NDTIMELINE_STREAM_KEY]
+                self._name2timer[name].stop(tag=tag, level=self.metric_level)
+        except KeyError:
+            self._unregistered_timer_stop.append(name)
+            NDTimelineLogger().warning(f"metric {name} is not registered when `stop_timer`, skipped")
+        except Exception:
+            NDTimelineLogger().error(f"trigger exception when `stop_timer` metric {name}")
+            NDTimelineLogger().error(traceback.format_exc())
+
+    def insert_record(self, name, start_ts: float, duration: float, tag: Optional[Dict[str, Any]] = None):
+        assert isinstance(self, NDTimerManager) or issubclass(type(self), NDTimerManager)
+        tag = tag if tag is not None else {}
+        try:
+            if name not in self._unregistered_timer_records_insert:
+                self._name2timer[name].insert_record(start_ts, duration, tag, self.metric_level)
+        except KeyError:
+            self._unregistered_timer_records_insert.append(name)
+            NDTimelineLogger().warning(f"metric {name} is not registered when `insert_record`, skipped")
+        except Exception:
+            NDTimelineLogger().error(f"trigger exception when `insert_record` metric {name}")
+            NDTimelineLogger().error(traceback.format_exc())
+
+    def clear(self):
+        self.async_flush(
+            step_range=range(0, 10),
+            next_iter_enabled=False,
+            collect_future=False,
+            submit2handler=False,
+            keep_timer_state=True,
+        )
+
+    def disable_and_save(self):
+        is_autogc = gc.isenabled()
+        if is_autogc:
+            gc.disable()
+        for k in self._name2timer:
+            self._name2active_tmp[k] = self._name2timer[k].is_enabled()
+            self._name2timer[k].disable()
+        if is_autogc:
+            gc.enable()
+
+    def recover_from_history(self):
+        is_autogc = gc.isenabled()
+        if is_autogc:
+            gc.disable()
+        for k in self._name2timer:
+            if k in self._name2active_tmp:
+                if self._name2active_tmp[k]:
+                    self._name2timer[k].enable()
+                else:
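+                    # it was already disabled before `disable_and_save`; keep it disabled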
self._name2timer[k].disable()
+                del self._name2active_tmp[k]
+        if is_autogc:
+            gc.enable()
+
+    def async_flush(
+        self,
+        step_range: range,
+        next_iter_enabled: bool = True,
+        world_info: Optional[WorldInfo] = None,
+        handlers: Optional[List[Callable[..., None]]] = None,
+        collect_future: bool = True,
+        submit2handler: bool = True,
+        force_calibrate: bool = False,
+        dynamic_calibrate: bool = False,
+        keep_timer_state: bool = False,
+        sequential_calibrate: bool = True,
+    ):
+        st = time.perf_counter()
+        handlers = handlers if handlers is not None else []
+        enabled_timer_names = [name for name in self._name2timer if self._name2timer[name].meta.enabled]
+        NDTimelineLogger().debug(f"async flush triggered, {enabled_timer_names}")
+
+        unregistered = self._unregistered_timer_start.copy()
+        unregistered.extend(self._unregistered_timer_stop)
+        unregistered.extend(self._unregistered_timer_records_insert)
+        unregistered = list(set(unregistered))
+        if len(unregistered) > 0:
+            NDTimelineLogger().warning(f"unregistered timers: {unregistered}")
+
+        past_name2timer = self._name2timer
+        fresh_name2timer = {}
+        timer_metas = [past_name2timer[name].meta.copy() for name in past_name2timer]
+
+        if not keep_timer_state:
+            for meta in timer_metas:
+                meta.enabled = next_iter_enabled
+
+        # filter enabled timers
+        past_name2timer = {
+            name: past_name2timer[name]
+            for name in past_name2timer
+            if past_name2timer[name].meta.enabled and past_name2timer[name].meta.level <= self.metric_level
+        }
+
+        NDTimerManager._register_timers(timer_metas, fresh_name2timer)
+
+        is_autogc = gc.isenabled()
+        if is_autogc:
+            gc.disable()
+        self._name2timer = fresh_name2timer
+        if is_autogc:
+            gc.enable()
+
+        if collect_future:
+            i = 0
+            while i < len(self._futures):
+                if self._futures[i].done():
+                    e = self._futures[i].exception()
+                    if e is not None:
+                        NDTimelineLogger().error("".join(traceback.format_exception(type(e), e, e.__traceback__)))
+                    self._futures.pop(i)
+                else:
+                    i += 1
+
+        if len(handlers) == 0:
+            handlers = self.handlers
+
+        require_calibrate = force_calibrate or (
+            dynamic_calibrate and GlobalReferenceTime.last_calibrated_at < (time.time() - 30 * 60) * 1e3
+        )
+        if require_calibrate and sequential_calibrate:
+            GlobalReferenceTime.calibrate()
+            require_calibrate = False
+
+        if submit2handler and len(past_name2timer) > 0:
+            world_info = self.world_info if world_info is None else world_info
+            future = self._executor.submit(
+                NDTimerManager._flush_timers, handlers, past_name2timer, step_range, world_info, require_calibrate
+            )
+            self._futures.append(future)
+
+        NDTimelineLogger().debug(f"async flush cost {1000 * (time.perf_counter() - st):4.2f}ms")
+
+    def wait(self) -> None:
+        if len(self._futures) == 0:
+            return
+        torch.distributed.barrier()
+        # wait at most 10 seconds
+        wait(self._futures, timeout=10, return_when=ALL_COMPLETED)
+        for f in self._futures:
+            e = f.exception(timeout=0.001)
+            if e is not None:
+                NDTimelineLogger().error("".join(traceback.format_exception(type(e), e, e.__traceback__)))
+        self._futures = []
+        # the streamer cannot respond to the training process now;
+        # assume the message will be handled within 3 seconds
+        time.sleep(3)
+
+
+class Singleton(type):
+    _instances = {}
+
+    def __call__(self, *args, **kwargs):
+        if self not in self._instances:
+            self._instances[self] = super().__call__(*args, **kwargs)
+            self._singleton_inited = True
+        return self._instances[self]
+
+
+class NDTimerManagerSingleton(NDTimerManager, metaclass=Singleton):
+    @classmethod
+    def is_initialized(cls) -> bool:
+        return hasattr(cls,
"_singleton_inited") and cls._singleton_inited + + +@contextlib.contextmanager +def ndtimeit(name: str, tag: Optional[Dict[str, Any]] = None): + """reentrant timeit context manager""" + if not NDTimerManagerSingleton.is_initialized(): + yield + return + tag = tag if tag is not None else {} + NDTimerManagerSingleton().start_timer(name, tag) + try: + yield + finally: + NDTimerManagerSingleton().stop_timer(name) + + +@contextlib.contextmanager +def ndtimeit_p2p(name: str, nccl_pg, peer: int, is_batched: bool = True, tag: Optional[Dict[str, Any]] = None): + if not NDTimerManagerSingleton.is_initialized(): + yield + return + p2p_stream = get_nccl_p2p_stream(name=name, nccl_pg=nccl_pg, peer=peer, is_batched=is_batched) + if tag is not None: + tag[NDTIMELINE_STREAM_KEY] = p2p_stream + else: + tag = {NDTIMELINE_STREAM_KEY: p2p_stream} + NDTimerManagerSingleton().start_timer(name, tag) + try: + yield + finally: + NDTimerManagerSingleton().stop_timer(name) + + +@contextlib.contextmanager +def ndtimeit_coll(name: str, pg, tensor: torch.Tensor, tag: Optional[Dict[str, Any]] = None): + if not NDTimerManagerSingleton.is_initialized(): + yield + return + coll_stream = get_nccl_coll_stream(name, pg, tensor) + if tag is not None: + tag[NDTIMELINE_STREAM_KEY] = coll_stream + else: + tag = {NDTIMELINE_STREAM_KEY: coll_stream} + NDTimerManagerSingleton().start_timer(name, tag) + try: + yield + finally: + NDTimerManagerSingleton().stop_timer(name) + + +def ndtimer(metric: str, tags: Optional[Dict[str, Any]] = None): + def _ndtimeit_decorator(func): + @wraps(func) + def with_ndtimeit(*args, **kwargs): + with ndtimeit(metric, tags): + return func(*args, **kwargs) + + return with_ndtimeit + + return _ndtimeit_decorator diff --git a/vescale/ndtimeline/variables.py b/vescale/ndtimeline/variables.py new file mode 100644 index 0000000..7ed9d65 --- /dev/null +++ b/vescale/ndtimeline/variables.py @@ -0,0 +1,27 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +import os + +SOCK_TIMEOUT_CLIENT: float = 2.0 # seconds +SOCK_PARENT_DIR: str = "/opt/tiger/tmp/ndtimeline" +SOCK_PATH: str = os.path.join(SOCK_PARENT_DIR, "ndtimeline.sock") # /opt/tiger/tmp/ndtimeline/ndtimeline.sock +LOCAL_LOGGING_PATH: str = SOCK_PARENT_DIR +DEFAULT_CUDA_EVENT_POOL_SIZE: int = 20 +NDTIMELINE_INNER_GLOBAL_STEP_KEY: str = "_inner_global_step" +NDTIMELINE_STREAM_KEY: str = "stream_key" +NDTIMELINE_FLUSH_SEPCIAL: str = "special" diff --git a/vescale/ndtimeline/world_info.py b/vescale/ndtimeline/world_info.py new file mode 100644 index 0000000..197e1a1 --- /dev/null +++ b/vescale/ndtimeline/world_info.py @@ -0,0 +1,123 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from dataclasses import dataclass +from typing import Any, Dict + + +@dataclass(frozen=False) +class TopoInfo: + rank: int = 0 + dp_rank: int = 0 + ddp_rank: int = 0 + tp_rank: int = 0 + pp_rank: int = 0 + local_rank: int = 0 + ip: str = "0.0.0.0" + dp_size: int = 1 + ddp_size: int = 1 + tp_size: int = 1 + pp_size: int = 1 + world_size: int = 1 + + def __post_init__(self): + # validation + for field_name in self.__dict__: + field_content = self.__dict__[field_name] + if field_name.endswith("rank") and field_content < 0: + raise ValueError(f"TopoInfo instance's {field_name}={field_content}, expected nonnegative number") + if field_name.endswith("size") and field_content <= 0: + raise ValueError(f"WorldInfo instance's {field_name}={field_content}, expected positive number") + + +@dataclass(frozen=False) +class TrainingInfo: + role_id: int = 0 + trial_id: int = 0 + run_id: int = 0 + + def __post_init__(self): + # validation + for field_name in self.__dict__: + field_content = self.__dict__[field_name] + if field_content < 0: + raise ValueError(f"TrainingInfo instance's {field_name}={field_content}, expected nonnegative number") + + +class WorldInfo: + def __init__( + self, + rank: int, + local_rank: int, + dp_rank: int = 0, + ddp_rank: int = 0, + tp_rank: int = 0, + pp_rank: int = 0, + dp_size: int = 1, + ddp_size: int = 1, + tp_size: int = 1, + pp_size: int = 1, + world_size: int = 1, + ip: str = "0.0.0.0", + role_id: int = 0, + run_id: int = 0, + trial_id: int = 0, + **extra_meta: Dict[str, Any], + ): + self.topo_info = TopoInfo( + rank=rank, + local_rank=local_rank, + dp_rank=dp_rank, + ddp_rank=ddp_rank, + tp_rank=tp_rank, + pp_rank=pp_rank, + dp_size=dp_size, + ddp_size=ddp_size, + tp_size=tp_size, + pp_size=pp_size, + world_size=world_size, + ip=ip, + ) + self.training_info = TrainingInfo( + role_id=role_id, + trial_id=trial_id, + run_id=run_id, + ) + self.extra_info = {} + for k in extra_meta: + self.extra_info[k] = extra_meta[k] + + def __repr__(self) -> str: + return f"WorldInfo: {self.topo_info.__repr__()} {self.training_info.__repr__()} {self.extra_info.__repr__()}" + + def __getitem__(self, key: str): + if key in self.topo_info.__dict__: + return self.topo_info.__dict__[key] + if key in self.training_info.__dict__: + return self.training_info.__dict__[key] + if key in self.extra_info: + return self.extra_info[key] + raise KeyError(f"{key} is not found") + + def __setitem__(self, key: str, value: Any): + if key in self.topo_info.__dict__: + self.topo_info.__dict__[key] = value + if key in self.training_info.__dict__: + self.training_info.__dict__[key] = value + if key in self.extra_info: + self.extra_info[key] = value + raise KeyError(f"{key} is not found") diff --git a/vescale/pipe/README.md b/vescale/pipe/README.md new file mode 100644 index 0000000..028bb44 --- /dev/null +++ b/vescale/pipe/README.md @@ -0,0 +1,125 @@ +# veScale Pipeline Parallel (PP) + 
+## TLDR
+
+![PP](../../docs/pictures/pp.png)
+
+## What is PP?
+
+`Pipeline Parallel` (`PP`) partitions the layers of a model across multiple devices to form a pipelined execution of training.
+`PP` takes as input a list of microbatches of data per iteration and performs pipelined training execution (forward, backward, and optimizer update) on each microbatch, while overlapping communication with computation on each device.
+
+## Why veScale PP?
+
+Existing `PP` systems suffer from multiple drawbacks, which prevent productization within a company:
+
+- _Complex API_: assuming that model developers are also systems experts in `PP`
+
+- _Hacking model code_: requiring manual rewrites of the model code to run `PP`
+
+- _Lacking single device abstraction_: requiring manual rewrites of the training script to be `PP` device-specific
+
+- _Lacking options of pipeline construction_: relying on a single option of graph tracing, or perfect graph tracing, or solely manual construction of the pipeline
+
+- _Lacking customizability of pipeline schedule_: deeply coupling the entire runtime (e.g., compute, communication) with a specific `PP` schedule (e.g., `1F1B`)
+
+- _Lacking diverse model support_: supporting only sequential model architectures without branching, or supporting only pipeline stages that have a single input or a single output
+
+## What is veScale PP?
+
+`veScale PP` offers a new `PP` framework that is both _**Easy-to-Use**_ and _**Easy-to-Customize**_; hence it is used internally in our production.
+Specifically, `veScale PP` provides:
+
+- _Easy API_: hiding the complexity of `PP` systems and runtimes from model developers
+
+- _Zero model code change_: keeping the original torch model code as it is for transparent pipelined models
+
+- _Single device abstraction_: keeping the single device training script as it is for transparent pipelined training on multiple devices
+
+- _Multiple options of pipeline construction_: users can flexibly choose between modes:
+
+    - `GRAPH_EAGER` mode automatically traces and parses the model into a graph, splits the graph into pipeline stages, and constructs each stage for pipeline execution
+
+        - the graph tracer itself can also be chosen by users
+
+    - `MANUAL_EAGER` mode manually constructs each pipeline stage for pipeline execution, without graph tracing, parsing, or splitting
+
+- _Customizable pipeline schedule_: empowering users to define their custom pipeline schedules, beyond our built-in schedules below:
+
+    - `1F1B`
+
+    - `Interleaved 1F1B`
+
+    - `Zero Bubble`
+
+- _Support for diverse models_: supporting comprehensive model architectures, including non-sequential models and stages with multiple inputs and outputs
+
+## Why is veScale PP a better option than its counterparts?
+
+- Compared with Megatron-LM's PP, `veScale PP` offers not only a better __Ease-of-Use__ experience in all aspects (easy API, zero model code change, single device abstraction, options of pipeline construction) but also a plus of __Customizability__, allowing users to conveniently customize new pipeline schedules.
+
+- Compared with DeepSpeed, `veScale PP` requires no modification of model code. It further supports multi-stage scheduling for non-sequential multimodal architectures and multi-input settings, instead of being constrained by `nn.Sequential`'s syntax.
+
+- Compared with the pre-release torchtitan, `veScale PP` provides: i) a single device abstraction of the training script, ii) wider options of graph tracer support, iii) wider model architecture support, and iv) guaranteed bitwise accuracy alignment between `PP` and single device code.
+
+## How does veScale PP work?
+
+Spinning up a `PP` job typically requires three steps: i) tracing and parsing the model graph, ii) constructing the pipeline stages, and iii) executing the pipeline schedule. These steps are handled by `PipeParser`, `PipeModule`, and `PipeEngine`, respectively. Upon receiving the model definition, `PipeParser` (`GRAPH_EAGER` mode) breaks the model code down into an intermediate representation of low-level modules and operators, up to the granularity of your choice. Under `MANUAL_EAGER` mode, users only need to assign stage modules and their communication relationships. `PipeModule` collects the parameters, operators, and optimizer states belonging to the same stage, and resolves the communication topology among devices. `PipeEngine` then schedules and executes training according to the chosen pipeline schedule.
+
+## How to use veScale PP?
+
+- Example of using `GRAPH_EAGER` mode:
+
+    ```python
+    # zero model code change
+    class EightMLP(nn.Module):
+        def __init__(self, ...):
+            self.mlp1 = MLP(...)
+            ...
+            self.mlp8 = MLP(...)
+        def forward(...):
+            ...
+
+    # An EightMLP is composed of 8 submodules called MLP
+    model = EightMLP()
+    # or model = deferred_init(EightMLP)
+
+    from vescale.plan import PipelineParallelPlan, PipelineScheduleType, PipelineSplitMethodType, ModeType
+    from vescale.pipe import construct_pipeline_stage
+    from vescale.engine import PipeEngine
+    from vescale.dtensor.device_mesh import DeviceMesh
+
+    # create 3-dim DeviceMesh
+    device_mesh = DeviceMesh("cuda", [[[0]], [[1]], [[2]], [[3]]], mesh_dim_names=("PP", "DP", "TP"))
+
+    # prepare plan for pipeline parallelism
+    pipe_plan = PipelineParallelPlan(
+        mode=ModeType.GRAPH_EAGER,
+        split_method=PipelineSplitMethodType.MANUAL,
+        num_stages=4,
+        virtual_chunks=2,
+        smallest_unsplittable_units=[f"mlp{i + 1}" for i in range(8)],  # maintain hierarchy of each MLP module
+        split_points=["mlp2", "mlp4", "mlp6", "mlp8"],  # managed pipeline split points by fully qualified names
+        overlap_p2p_comm=True,  # speedup option
+        schedule_type=PipelineScheduleType.INTERLEAVED_1F1B,
+    )
+
+    # parse model graph, split graph, and construct pipeline stage
+    pipe_stage = construct_pipeline_stage(model, pipe_plan, device_mesh)
+
+    # prepare pipeline schedule and execution engine
+    engine = PipeEngine(pipe_stage, device_mesh, pipe_plan)
+
+    # train PP model as if on single device
+    for minibatch_data in dataloader:
+        minibatch_loss, microbatch_outputs = engine(minibatch_data)
+        minibatch_loss.backward()
+        ...
+
+    ```
+
+- Example of using `MANUAL_EAGER` mode: Coming Soon.
+
+- APIs can be found in `/vescale/pipe/pipe_stage.py` and `/vescale/engine/pipe.py`
+
+- More examples can be found in `/test/parallel/pipeline/api/test_simple_api.py`
\ No newline at end of file
diff --git a/vescale/pipe/__init__.py b/vescale/pipe/__init__.py
new file mode 100644
index 0000000..2403b36
--- /dev/null
+++ b/vescale/pipe/__init__.py
@@ -0,0 +1,26 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from .pipe_stage import ( + PipeModule, + build_shared_module_group, + build_stage_module_and_dependency, + construct_stage_modules, + construct_pipeline_stage, +) +from .pipe_parser import PipeParser, parse_model_graph, split_pipeline_point, construct_pipeline_split_graph +from .pipe_emmiter import ScheduleEngine, validate_pipeline_schedule diff --git a/vescale/pipe/_schedules/__init__.py b/vescale/pipe/_schedules/__init__.py new file mode 100644 index 0000000..66cb2c6 --- /dev/null +++ b/vescale/pipe/_schedules/__init__.py @@ -0,0 +1,21 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +from .instruction_base import StageDeps, Shape, register_instruction +from .pipedream_flush import OneFOneBInstrcutionGenerator +from .looping_bfs import InterleavedOneFOneBInstructionGenerator +from .zero_bubble_v import ZeroBubbleVInstrcutionGenerator diff --git a/vescale/pipe/_schedules/instruction_base.py b/vescale/pipe/_schedules/instruction_base.py new file mode 100644 index 0000000..d43474e --- /dev/null +++ b/vescale/pipe/_schedules/instruction_base.py @@ -0,0 +1,552 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+################################################################################
+
+import enum
+from dataclasses import dataclass
+from collections import defaultdict
+from abc import ABCMeta, abstractmethod
+from typing import Sequence, Callable
+import torch
+from torch.distributed.distributed_c10d import get_rank
+from vescale.dtensor.device_mesh import DeviceMesh
+from vescale.dtensor.placement_types import Placement
+from vescale.pipe.pipe_stage import PipeModule
+from typing import List, Tuple, Union, Optional, Dict, Any
+import logging
+import numpy as np
+from vescale.plan.spec import PipelineP2PSpec
+
+Shape = Union[List[int], torch.Size]
+
+logger = logging.getLogger(__name__)
+registed_functions = {}
+
+
+def register_instruction(name):
+    assert name is not None, "The instruction must have a name"
+    if name in registed_functions:
+        msg = f"{name} is already a registered instruction"
+        logger.warning(msg)
+
+    def _register_instruction(func):
+        def wrap(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        registed_functions.update({name: func})
+        return wrap
+
+    return _register_instruction
+
+
+@dataclass
+class CommPacket:
+    cur_mesh: DeviceMesh
+    peer_mesh: DeviceMesh
+    input_id: int
+    peer_stage: int
+    peer_sharding: List[Placement] = None
+    cur_sharding: List[Placement] = None
+    is_kwargs: bool = False
+
+
+class StageDeps:
+    def __init__(
+        self,
+        dep: np.ndarray,
+        meshes: List[DeviceMesh],
+        vpp_module_list: Union[List, PipeModule],
+        p2p_index_mapping: Optional[Dict[int, List[PipelineP2PSpec]]] = None,
+    ):
+        self.D = dep
+        self.M = vpp_module_list
+        self.meshes = meshes
+        self.is_vpp = self.get_num_chunks() > 1
+        self.mapping: Dict = {}
+        if p2p_index_mapping is None:
+            self.mapping = defaultdict(list)
+            self.generate_one_forward_mapping()
+        else:
+            self.mapping = p2p_index_mapping
+            self.parsing_forward_mapping()
+
+        self.recv_tables: Dict[int, List[CommPacket]] = defaultdict(list)
+        self.send_tables: Dict[int, List[CommPacket]] = defaultdict(list)
+        self.local_dataloader_list: Dict[Any, List[CommPacket]] = defaultdict(list)
+        self.construct_communication_graph()
+
+    def construct_communication_graph(self):
+        for i in range(self.num_stage):
+            cur_mesh = self.get_current_mesh(i)
+            cur_mapping = self.mapping[i]  # get the index mapping of stage i
+            prior_list = []
+            local_data_list = []
+            # stage_id: [input_idx, ...]
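+            # e.g. a hypothetical mapping entry {1: [PipelineP2PSpec(0, 0), PipelineP2PSpec(1, 0)]} means
+            # stage 1 consumes output 0 of stage 0, plus input 0 fed by its own local dataloader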
+            for p2p_spec in cur_mapping:
+                prev_stage_id = p2p_spec.peer_stage_idx
+                input_id = p2p_spec.peer_output_idx
+                if prev_stage_id != i:  # not from self
+                    prior_list.append((self.get_current_mesh(prev_stage_id), prev_stage_id, input_id))
+                else:  # from self stage
+                    local_data_list.append(input_id)
+
+            prior_list = sorted(prior_list, key=lambda item: (item[1], item[2]))
+            for device, pre, input_id in prior_list:
+                sr = CommPacket(
+                    cur_mesh=cur_mesh, peer_mesh=device, input_id=input_id, peer_stage=pre
+                )  # input is single
+                self.recv_tables[i].append(sr)
+            for input_id in local_data_list:
+                sr = CommPacket(
+                    cur_mesh=cur_mesh,
+                    peer_mesh=None,
+                    input_id=input_id,
+                    peer_stage=None,
+                )
+                self.local_dataloader_list[i].append(sr)
+
+        # construct out degree
+        for i in range(self.num_stage):
+            prior_list = []
+            for j in range(self.num_stage):
+                if i == j:  # don't check self, no cycle
+                    continue
+                j_recvs = self.recv_tables[j]
+                for recv in j_recvs:
+                    if recv.peer_stage == i:  # stage i sends to stage j
+                        send = CommPacket(
+                            cur_mesh=recv.peer_mesh,
+                            peer_mesh=recv.cur_mesh,
+                            input_id=recv.input_id,
+                            peer_stage=j,
+                        )
+                        prior_list.append(send)
+            # sort by input_id; stage id is not needed
+            prior_list = sorted(prior_list, key=lambda item: item.input_id)
+            self.send_tables[i] = prior_list
+
+    def generate_one_forward_mapping(self):
+        for i in range(self.num_stage):
+            cur_mapping = self.mapping[i]
+            pre_stages = self.get_pre_stage(i, ignore_virtual=False)
+            assert len(pre_stages) <= 1, "multi-branch stages need an explicit p2p_index_mapping"
+            for pre in pre_stages:
+                cur_mapping.append(PipelineP2PSpec(pre, 0))
+
+            if self.is_pipeline_first_stage(i):
+                cur_mapping.append(PipelineP2PSpec(i, 0))
+
+    def parsing_forward_mapping(self):
+        # e.g. 1: [(0,0), (1,0), (0,2)]
+        for i in range(self.num_stage):
+            if i not in self.mapping:
+                cur_indexing = []
+                pre_stages = self.get_pre_stage(i, ignore_virtual=False)
+                assert len(pre_stages) <= 1, "multi-branch stages need an explicit p2p_index_mapping"
+                for pre in pre_stages:
+                    cur_indexing.append(PipelineP2PSpec(pre, 0))
+                if self.is_pipeline_first_stage(i):
+                    cur_indexing.append(PipelineP2PSpec(i, 0))
+                self.mapping.update({i: cur_indexing})
+
+    def get_send_comms(self, i):
+        return self.send_tables[i]
+
+    def get_recv_comms(self, i):
+        return self.recv_tables[i]
+
+    def get_local_comms(self, i):
+        return self.local_dataloader_list[i]
+
+    @property
+    def num_stage(self):
+        return len(self.D)
+
+    def is_first(self, s_id):
+        pre = self.D[:, s_id]
+        non_zero = np.count_nonzero(pre)
+        if non_zero == 0:
+            return True
+        return False
+
+    def is_last(self, s_id):
+        post = self.D[s_id]
+        non_zero = np.count_nonzero(post)
+        if non_zero == 0:
+            return True
+        return False
+
+    def get_pre_stage(self, i, ignore_virtual=True):
+        pre = self.D[:, i]
+        stage_ids = np.where(pre == 1)[0].tolist()
+        if self.is_first(i) and self.is_vpp and not ignore_virtual:
+            last_stages = list(filter(self.is_last, range(self.num_stage)))
+            return last_stages
+        else:
+            return stage_ids
+
+    def get_post_stage(self, i, ignore_virtual=True):
+        post = self.D[i]
+        stage_ids = np.where(post == 1)[0].tolist()
+
+        if self.is_last(i) and self.is_vpp and not ignore_virtual:
+            first_stages = list(filter(self.is_first, range(self.num_stage)))
+            return first_stages
+        else:
+            return stage_ids
+
+    def get_first_stage(self):
+        stages = []
+        for i in range(self.num_stage):
+            pre_stages = self.get_pre_stage(i)
+            if len(pre_stages) == 0:  # in-degree is 0
+                stages.append(i)
+        return stages
+
+    def get_last_stage(self):
+        stages = []
+        for i in range(self.num_stage):
range(self.num_stage): + post_stages = self.get_post_stage(i) + if len(post_stages) == 0: # out-degree is 0 + stages.append(i) + return stages + + def get_current_model(self, i): + return self.M + + def is_pipeline_first_stage(self, i): + pre = self.get_pre_stage(i) + return len(pre) == 0 # first stage has no input + + def is_pipeline_last_stage(self, i): + post = self.get_post_stage(i) + return len(post) == 0 # last stage has no output + + def is_vpp_first_stage(self, i, chunk_id): + return self.is_pipeline_first_stage(i) and chunk_id == 0 + + def is_vpp_last_stage(self, i, chunk_id): + return self.is_pipeline_last_stage(i) and (chunk_id == (self.get_num_chunks() - 1)) + + def get_num_chunks(self): + if isinstance(self.M, list): + return len(self.M) + else: + return self.M.virtual_chunks + + def get_current_mesh(self, i): + return self.meshes[i] + + def __str__(self): + tmp = "\n\n" + tmp += f"stages: {self.num_stage}, deps:{self.D}\n" + for i in range(self.num_stage): + tmp += f"\n===================stage:{i} start=======================\n" + tmp += "recv : \n" + for comm in self.recv_tables[i]: + tmp += f"\t\t recv from {comm.peer_stage} with input:{comm.input_id} comm:{comm}\n" + tmp += "send : \n" + for comm in self.send_tables[i]: + tmp += f"\t\t send to {comm.peer_stage} with input:{comm.input_id} comm:{comm}\n" + tmp += "local_dataloader_list : \n" + for comm in self.local_dataloader_list[i]: + tmp += f"\t\t local_dataloader with input:{comm.input_id} comm:{comm}\n" + + tmp += f"===================stage:{i} end=======================\n\n" + return tmp + + +def get_linear_pp_module_dep2(module_list: List, device_mesh_list: List[DeviceMesh]): + stage_len = len(device_mesh_list) # for forward + dep = np.zeros((stage_len, stage_len), dtype=np.int64) + for i in range(stage_len - 1): + dep[i][i + 1] = 1 # direct graph + return StageDeps(dep, device_mesh_list, module_list) + + +@dataclass +class Status: + batch_idx: int = 0 + stage_id: int = 0 + chunk_id: int = 0 + f_b: "str" = "" # forward or backward + stg: "str" = "" # stage for 1f1b + k: int = 0 + + def __str__(self): + return f"b:{self.batch_idx}, c:{self.chunk_id}, {self.stg + '-' + self.f_b}" + + +class PipelineSchema(metaclass=ABCMeta): + """ + we define this class to abstract the pipeline execute + Args: + dep: the dependency for adjacency martrix + meshes: the list for stage of + + """ + + def __init__(self, num_stage: int, meshes: Union[List[DeviceMesh], int], batches: int = 1): + self.num_stage = num_stage + self.meshes = meshes + self.batches = batches + self._schedules: List[List[Tuple]] = self._gen_schedule() + + @property + @abstractmethod + def name(self): + """print schedule name""" + raise NotImplementedError() + + @abstractmethod + def _gen_schedule(self): + """generator the pipelinne schedule for engine""" + raise NotImplementedError("not impl") + + def __str__(self): + """print the pipeline clock work""" + stream = "\n" + d = " ".join([f"d{d:<24}" for d in range(self.num_mesh)]) + stream += f"T k :{d:<24} \n" + for time, scheds in enumerate(self.schedules): + sched_str = " ".join([f"{str(sched):<24}" for sched in scheds]) + stream += f"T {time:<2}: {sched_str} \n" + return stream + + @property + def schedules(self): + """return schedules""" + return self._schedules + + @property + def num_mesh(self): + """return the num mesh of tp group""" + if isinstance(self.meshes, Sequence): + return len(self.meshes) + elif isinstance(self.meshes, int): + return self.meshes + else: + raise NotImplementedError("unsupport device 
mesh list") + + @property + def num_clock(self): + """return num schedule for the num clock""" + + return len(self._schedules) + + +@dataclass +class BaseInstruction(metaclass=ABCMeta): + @abstractmethod + def run(self, *args, **kwargs): + raise NotImplementedError("unsupport run command") + + @property + def name(self): + return "base_instruction" + + def dump(self): + return f"{get_rank()}: {self}" + + +class InstructionGenerator(metaclass=ABCMeta): + def __init__( + self, + deps: StageDeps, + meshes: int, + batches: int, + default_shape: Optional[Shape] = None, + default_dtype: Optional[torch.dtype] = None, + batch_shape_lists: Optional[List[Any]] = None, + batch_dtype_lists: Optional[List[Any]] = None, + forward_only=False, + num_chunk=1, + ): + self.deps = deps + self.meshes = meshes + self.num_chunk = num_chunk + self.batches = batches + self.default_shape = default_shape + self.default_dtype = default_dtype + self.batch_shape_lists = batch_shape_lists + self.batch_dtype_lists = batch_dtype_lists + self.forward_only = forward_only + self.instruction_list: List = [] + + """ + generate instruction + """ + + @abstractmethod + def gen_instruction(self): + raise NotImplementedError("not implement") + + """ + get current stage instruction + """ + + def get_instruction_list(self, stage: int): + return self.instruction_list[stage] + + """ + update with batch idx, stage idx + """ + + def _set_inst(self, inst: BaseInstruction, s: int): + self.instruction_list[s].append(inst) + + """ + set instruction type + """ + + def execute(self, *args, **kwargs): + raise NotImplementedError("not implement") + + +class InstructionBuilder: + global_instructions_funcs = defaultdict(list) + global_instructions_str = defaultdict(list) + + constant_data = defaultdict() + user_data = defaultdict() + loss_fn: Callable = torch.sum + dataloader: Any + topo: StageDeps + model: Callable + stage_id: int + _pos = 0 + _stack = None + + def build_from_dict(self, instructions: Dict): + assert isinstance(instructions, dict), "instructions should be dict" + for stage_id, instruction_list in instructions.items(): + cur_stage_ins_list = instruction_list + if isinstance(cur_stage_ins_list, str): + instructions_funcs = cur_stage_ins_list.split(",") + else: + instructions_funcs = cur_stage_ins_list + + mapped_functions = [registed_functions[x] for x in instructions_funcs] + + self.global_instructions_funcs[stage_id] = mapped_functions + self.global_instructions_str[stage_id] = instructions_funcs + + def draw_instructions(self): + from matplotlib import pyplot as plt + + fig, ax = plt.subplots() + # draw rectangle + stage_nums = len(self.global_instructions_str.keys()) + for stage_id, instuctions_strs in self.global_instructions_str.items(): + for id, stage_str in enumerate(instuctions_strs): + ax.add_patch(plt.Rectangle((id, -1 * stage_id), 1, 1, fill=False, edgecolor="black", lw=2)) + ax.text(id + 0.5, -1 * stage_id + 0.5, stage_str, ha="center", va="center") + + for stage_id in range(stage_nums): + ax.text(-0.5, -1 * stage_id + 0.5, stage_id, ha="center", va="center") + # set max xlim and ylim + max_stages = max(len(x) for x in self.global_instructions_str.values()) + ax.set_xlim(0, max_stages) + ax.set_ylim(-1 * stage_nums + 1, 1) + ax.axis("off") + plt.savefig("instructions.png") + + @property + def pos(self): + return self._pos + + @property + def last(self): + return self._stack + + def run(self, stage_id: int): + output = [] + for pos, fn in enumerate(self.global_instructions_funcs[stage_id]): + self._pos = pos + out = 
fn()
+            self._stack = out
+            output.append(out)
+        return output
+
+    def export(self, stage_id, *args, **kwargs):
+        func_lists = self.global_instructions_funcs[stage_id]
+
+        class Model(torch.nn.Module):
+            def __init__(self, func_lists, model):
+                super().__init__()
+                self.func_lists = func_lists
+                self.model = model
+
+            def forward(self, *args, **kwargs):
+                for f in self.func_lists:
+                    # TODO: handle this to make the forward instruction work.
+                    if f.__name__ == "forward":
+                        activation = self.model(*args, **kwargs)
+                        args = (activation,)
+                    else:
+                        args, kwargs = f(*args, **kwargs)
+                return args, kwargs
+
+        model = Model(func_lists, self.model)
+        graph = torch.export.export(model, args)
+        return graph
+
+
+class CompilePPCollectiveKind(enum.Enum):
+    SEND = 1
+    RECV = 2
+    BORADCAST = 3  # for cross-mesh collective
+    UNKNOWN = 4
+
+
+class CompilePPCollectiveOperator:
+    def __init__(
+        self,
+        kind: CompilePPCollectiveKind,
+        src: int = None,
+        dst: List[int] = None,
+        is_backward: bool = False,
+    ) -> None:
+        assert kind in (
+            CompilePPCollectiveKind.BORADCAST,
+            CompilePPCollectiveKind.SEND,
+            CompilePPCollectiveKind.RECV,
+        )
+        self.kind = kind
+        self.is_backward = is_backward
+
+        if self.kind is CompilePPCollectiveKind.SEND:
+            assert dst is not None and isinstance(dst, int)
+        elif self.kind is CompilePPCollectiveKind.RECV:
+            assert src is not None and isinstance(src, int)
+        else:
+            assert src is not None and isinstance(src, int)
+            assert dst is not None and isinstance(dst, list)
+            assert src in dst
+
+        self.src = src
+        self.dst = dst
+
+    def __hash__(self) -> int:
+        if isinstance(self.dst, list):
+            dst = tuple(self.dst)
+        else:
+            dst = self.dst
+        return hash((self.kind, self.src, dst, self.is_backward))
+
+
+VESCALE_INTRUCTION_BUILDER = InstructionBuilder()
diff --git a/vescale/pipe/_schedules/looping_bfs.py b/vescale/pipe/_schedules/looping_bfs.py
new file mode 100644
index 0000000..4d0b6e6
--- /dev/null
+++ b/vescale/pipe/_schedules/looping_bfs.py
@@ -0,0 +1,1789 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from vescale.pipe._schedules.instruction_base import (
+    PipelineSchema,
+    Status,
+    Shape,
+    InstructionGenerator,
+    StageDeps,
+    BaseInstruction,
+    CommPacket,
+    VESCALE_INTRUCTION_BUILDER as builder,
+    register_instruction,
+    registed_functions,
+)
+import contextlib
+from dataclasses import dataclass, field
+from vescale.dtensor.dtensor import DTensor
+import torch
+from collections import defaultdict
+from inspect import signature
+import numpy as np
+from vescale.dtensor.device_mesh import DeviceMesh
+from typing import List, Sequence, Optional, Dict, Union, Callable
+from functools import partial
+from vescale.dtensor._diff import dummy_p2p, manage_dump_file
+from vescale.pipe.p2p_communication import (
+    recv_forward,
+    drain_send_reqs,
+    drain_recv_reqs,
+    send_forward_backward_recv_forward_backward,
+    send_forward_recv_forward,
+    send_backward_recv_backward,
+)
+from vescale.model.base_gpt.utils import switch_dtensor
+
+
+@dataclass
+class RECV_FORWARD(BaseInstruction):
+    comm_packages: List[CommPacket] = field(default_factory=list)
+    tensor_shapes: Union[List[Shape], Shape] = field(default_factory=list)
+    tensor_dtypes: Union[List[torch.dtype], torch.dtype] = field(default_factory=list)
+    batch_p2p_comm: bool = True
+    batch_id: Optional[int] = None
+    is_pp_first_stage: bool = False
+    debug: str = ""
+
+    @property
+    def name(self):
+        return "recv_forward"
+
+    @dummy_p2p
+    def run(self) -> List:
+        if self.is_pp_first_stage:
+            return None
+
+        def f(info):
+            comm, shape, dtype = info
+            return recv_forward(
+                tensor_shape=shape,
+                recv_dtype=dtype,
+                current_device_mesh=comm.cur_mesh,
+                peer_device_mesh=comm.peer_mesh,
+                batch_p2p_comm=self.batch_p2p_comm,
+            )
+
+        infos = zip(self.comm_packages, self.tensor_shapes, self.tensor_dtypes)
+        out = list(map(f, infos))
+        return out if len(out) > 0 else None
+
+
+@dataclass
+class WAIT_FWD(BaseInstruction):
+    @property
+    def name(self):
+        return "wait_forward"
+
+    @dummy_p2p
+    def run(self, fwd_wait_handles: Optional[Sequence]):
+        if fwd_wait_handles is not None:
+            for req in fwd_wait_handles:
+                req.wait()
+
+
+@dataclass
+class DRAIN_SEND_REQS(BaseInstruction):
+    @property
+    def name(self):
+        return "drain_send_reqs"
+
+    @dummy_p2p
+    def run(self):
+        drain_send_reqs()
+
+
+@dataclass
+class DRAIN_RECV_REQS(BaseInstruction):
+    drain_type: str = "all"
+    check_bwd_wait: bool = False
+
+    @property
+    def name(self):
+        return "drain_recv_reqs"
+
+    @dummy_p2p
+    def run(self, bwd_wait_handles: Optional[Sequence]):
+        if self.check_bwd_wait:
+            if bwd_wait_handles is not None:
+                drain_recv_reqs(self.drain_type)
+        else:
+            drain_recv_reqs(self.drain_type)
+
+
+@dataclass
+class DEALLOCATE_OUTPUT_TENSOR(BaseInstruction):
+    @property
+    def name(self):
+        return "deallocate tensor"
+
+    @dummy_p2p
+    def run(self, output_tensor, deallocate_pipeline_outputs):
+        def deallocate(output_tensor):
+            if (output_tensor is None) or (not deallocate_pipeline_outputs):
+                return
+            assert isinstance(
+                output_tensor, (torch.Tensor, DTensor)
+            ), f"expected Tensor, found {type(output_tensor).__name__}."
+            assert output_tensor._base is None, "counter-productive to free a view of another tensor."
+            # Shrink the underlying storage to a single element so the activation
+            # memory is released while the autograd graph node itself stays alive.
+            if isinstance(output_tensor, DTensor):
+                output_tensor._local_tensor.data = torch.empty(
+                    (1,),
+                    device=output_tensor.device,
+                    dtype=output_tensor.dtype,
+                )
+            else:
+                output_tensor.data = torch.empty(
+                    (1,),
+                    device=output_tensor.device,
+                    dtype=output_tensor.dtype,
+                )
+            return
+
+        if not isinstance(output_tensor, Sequence):
+            output_tensor = [output_tensor]
+        for t in output_tensor:
+            deallocate(t)
+
+
+@dataclass
+class APPEND_INPUTS(BaseInstruction):
+    chunk: int = 0
+
+    @property
+    def name(self):
+        return "append inputs"
+
+    @dummy_p2p
+    def run(self, input_tensor, input_tensors):
+        input_tensors[self.chunk].append(input_tensor)
+
+
+@dataclass
+class APPEND_GRADS(BaseInstruction):
+    chunk: int = 0
+
+    @property
+    def name(self):
+        return "append grads"
+
+    @dummy_p2p
+    def run(self, output_tensor_grad, output_tensor_grads):
+        output_tensor_grads[self.chunk].append(output_tensor_grad)
+
+
+@dataclass
+class SEND_FORWARD_BACKWARD_RECV_FORWARD_BACKWARD(BaseInstruction):
+    recv_prev: bool = False
+    recv_next: bool = False
+    send_comms: List[CommPacket] = field(default_factory=list)
+    recv_comms: List[CommPacket] = field(default_factory=list)
+    recv_shapes: List[Shape] = field(default_factory=list)
+    recv_dtypes: List[torch.dtype] = field(default_factory=list)
+    batch_p2p_comm: bool = True
+    overlap_p2p_comm: bool = False  # read in run() when unpacking async handles
+    debug: str = ""
+
+    @property
+    def name(self):
+        return "send forward backward recv forward backward"
+
+    @dummy_p2p
+    def run(self, output_tensor, input_tensor_grad):
+        if not isinstance(output_tensor, Sequence):
+            output_tensor = [output_tensor]
+        if not isinstance(input_tensor_grad, Sequence):
+            input_tensor_grad = [input_tensor_grad]
+
+        def f(info):
+            output_tensor, input_tensor_grad, recv_comm, send_comm, tensor_shape, dtype = info
+            if isinstance(output_tensor, DTensor):
+                output_tensor = output_tensor._local_tensor
+            if isinstance(input_tensor_grad, DTensor):
+                input_tensor_grad = input_tensor_grad._local_tensor
+
+            input_tensor, output_tensor_grad = send_forward_backward_recv_forward_backward(
+                output_tensor=output_tensor,
+                input_tensor_grad=input_tensor_grad,
+                recv_prev=self.recv_prev,
+                recv_next=self.recv_next,
+                current_device_mesh=send_comm.cur_mesh,
+                prev_device_mesh=recv_comm.peer_mesh,
+                next_device_mesh=send_comm.peer_mesh,
+                tensor_shape=tensor_shape,
+                recv_dtype=dtype,
+                batch_p2p_comm=self.batch_p2p_comm,
+            )
+            return input_tensor, output_tensor_grad
+
+        zipped_data = list(
+            zip(
+                output_tensor,
+                input_tensor_grad,
+                self.recv_comms,
+                self.send_comms,
+                self.recv_shapes,
+                self.recv_dtypes,
+            )
+        )
+
+        outputs = list(map(f, zipped_data))
+
+        if len(outputs) > 1:
+            if self.overlap_p2p_comm:
+                out = [x[0] for x in outputs]
+                handle = [x[1] for x in outputs]
+                return out, handle
+            else:
+                return outputs
+        else:
+            return outputs[0]
+
+
+@dataclass
+class SEND_FORWARD_RECV_FORWARD(BaseInstruction):
+    recv_prev: bool = False
+    send_shapes: List[Shape] = field(default_factory=list)
+    send_tensor_shapes_unpad: List[Shape] = field(default_factory=list)
+    send_dtypes: List[torch.dtype] = field(default_factory=list)
+    batch_p2p_comm: bool = True
+    overlap_p2p_comm: bool = False
+    send_comms: List[CommPacket] = field(default_factory=list)
+    recv_comms: List[CommPacket] = field(default_factory=list)
+    microbatch_id: int = 0
+    debug: str = ""
+
+    @property
+    def name(self):
+        return "send forward recv forward"
+
+    @dummy_p2p
+    def run(self, output_tensor):
+        if not isinstance(output_tensor, Sequence):
+            output_tensor = [output_tensor]
+
+        def f(info):
+            output_tensor, 
recv_comm, send_comm, tensor_shape, tensor_shape_unpad, dtype = info + if isinstance(output_tensor, DTensor): + output_tensor = output_tensor._local_tensor + output = send_forward_recv_forward( + output_tensor, + recv_prev=self.recv_prev, + tensor_shape=tensor_shape, + send_tensor_shape_unpad=tensor_shape_unpad, + overlap_p2p_comm=self.overlap_p2p_comm, + batch_p2p_comm=self.batch_p2p_comm, + recv_dtype=dtype, + current_device_mesh=send_comm.cur_mesh, + prev_device_mesh=recv_comm.peer_mesh, + next_device_mesh=send_comm.peer_mesh, + ) + return output + + zipped_data = list( + zip( + output_tensor, + self.recv_comms, + self.send_comms, + self.send_shapes, + self.send_tensor_shapes_unpad, + self.send_dtypes, + ) + ) + + outputs = list(map(f, zipped_data)) + + if len(outputs) > 1: + if self.overlap_p2p_comm: + out = [x[0] for x in outputs] + handle = [x[1] for x in outputs] + return out, handle + else: + return outputs + else: + return outputs[0] + + +@dataclass +class SEND_BACKWARD_RECV_BACKWARD(BaseInstruction): + recv_next: bool = False + send_shapes: List[Shape] = field(default_factory=list) + send_tensor_shapes_unpad: List[Shape] = field(default_factory=list) + send_dtypes: List[torch.dtype] = field(default_factory=list) + batch_p2p_comm: bool = True + overlap_p2p_comm: bool = False + send_comms: List[CommPacket] = field(default_factory=list) + recv_comms: List[CommPacket] = field(default_factory=list) + debug: str = "" + + @property + def name(self): + return "send backward recv backward" + + @dummy_p2p + def run(self, input_tensor_grad): + if not isinstance(input_tensor_grad, Sequence): + input_tensor_grad = [input_tensor_grad] + + def f(info): + input_tensor_grad, recv_comm, send_comm, tensor_shape, tensor_shape_unpad, dtype = info + if isinstance(input_tensor_grad, DTensor): + input_tensor_grad = input_tensor_grad._local_tensor + output = send_backward_recv_backward( + input_tensor_grad, + recv_next=self.recv_next, + tensor_shape=tensor_shape, + send_tensor_shape_unpad=tensor_shape_unpad, + overlap_p2p_comm=self.overlap_p2p_comm, + batch_p2p_comm=self.batch_p2p_comm, + recv_dtype=dtype, + current_device_mesh=send_comm.cur_mesh, + prev_device_mesh=recv_comm.peer_mesh, + next_device_mesh=send_comm.peer_mesh, + ) + return output + + zipped_data = list( + zip( + input_tensor_grad, + self.recv_comms, + self.send_comms, + self.send_shapes, + self.send_tensor_shapes_unpad, + self.send_dtypes, + ) + ) + + output = list(map(f, zipped_data)) + + if len(output) > 1: + if self.overlap_p2p_comm: + return [x[0] for x in output], [x[1] for x in output] + else: + return output + else: + return output[0] + + +@dataclass +class SET_INPUTGRAD_TO_NONE(BaseInstruction): + @property + def name(self): + return "set inputgrad to none" + + @dummy_p2p + def run(self): + return None + + +@dataclass +class SET_OUTPUT_TO_NONE(BaseInstruction): + @property + def name(self): + return "set output to none" + + @dummy_p2p + def run(self): + return None + + +@dataclass +class BWD(BaseInstruction): + is_vpp_last_stage: bool = False + last_microbatch_for_model_chunk: bool = False + grad_sync_chunk_id: int = 0 + grad_sync_microbatch_id: int = 0 + model_chunk_id: int = 0 + microbatch_id: int = 0 + debug: str = "" + + @property + def name(self): + return "backward" + + def backward_step( + self, + input_tensor, + output_tensor, + output_tensor_grad, + grad_scaler=None, + deallocate_pipeline_outputs=False, + ): + """Backward step through passed-in output tensor. 
+
+        If last stage, output_tensor_grad is None, otherwise gradient of loss
+        with respect to stage's output tensor.
+
+        Returns gradient of loss with respect to input tensor (None if first
+        stage)."""
+
+        # NOTE: This code currently can handle at most one skip connection. It
+        # needs to be modified slightly to support arbitrary numbers of skip
+        # connections.
+
+        # Retain the grad on the input_tensor.
+        unwrap_input_tensor_grad = False
+        if not isinstance(input_tensor, list):
+            input_tensor = [input_tensor]
+            unwrap_input_tensor_grad = True
+        for x in input_tensor:
+            if x is not None:
+                x.retain_grad()
+
+        if not isinstance(output_tensor, list):
+            output_tensor = [output_tensor]
+        if not isinstance(output_tensor_grad, list):
+            output_tensor_grad = [output_tensor_grad]
+
+        # extract the loss value from the output tensors
+        if isinstance(output_tensor[0], Sequence):
+            for j in range(len(output_tensor[0])):
+                if output_tensor[0][j].ndim == 0 and output_tensor[0][j].numel() == 1:
+                    loss_value = output_tensor[0][j]
+                    break
+            else:
+                loss_value = output_tensor[0][-1]
+        else:
+            loss_value = output_tensor[0]
+
+        # Backward pass.
+        if output_tensor_grad[0] is None and grad_scaler is not None:
+            loss_value = grad_scaler(loss_value)
+        # FIXME: For virtual pipeline, there may exist a frozen layer without grad;
+        # need to verify whether this solution is correct
+        if not loss_value.requires_grad:
+            return None
+
+        model_chunk_id = builder.user_data["model_chunk_id"]
+        model = builder.model[model_chunk_id]
+        if deallocate_pipeline_outputs:
+            assert 0, "deallocate_pipeline_outputs is not supported in backward_step"
+        else:
+            switch_dtensor(torch.autograd.backward)(loss_value, grad_tensors=output_tensor_grad[0])
+
+        # Collect the grad of the input_tensor.
+        input_tensor_grad = [None]
+        if input_tensor is not None:
+            input_tensor_grad = []
+            for x in input_tensor:
+                if x is None:
+                    input_tensor_grad.append(None)
+                else:
+                    input_tensor_grad.append(x.grad)
+
+        if unwrap_input_tensor_grad:
+            input_tensor_grad = input_tensor_grad[0]
+
+        return input_tensor_grad
+
+    @dummy_p2p
+    def run(
+        self,
+        input_tensors,
+        output_tensors,
+        output_tensor_grads,
+        grad_sync_func,
+        synchronized_model_chunks,
+        kwargs: dict,
+    ):
+        grad_scaler, model, deallocate_pipeline_outputs = (
+            kwargs["grad_scaler"],
+            kwargs["model"],
+            kwargs["deallocate_pipeline_outputs"],
+        )
+        if self.is_vpp_last_stage:
+            if len(output_tensor_grads[self.model_chunk_id]) == 0:
+                output_tensor_grads[self.model_chunk_id].append(None)
+        input_tensor = input_tensors[self.model_chunk_id].pop(0)
+        output_tensor = output_tensors[self.model_chunk_id].pop(0)
+        output_tensor_grad = output_tensor_grads[self.model_chunk_id].pop(0)
+        input_tensor_grad = self.backward_step(
+            input_tensor, output_tensor, output_tensor_grad, grad_scaler, deallocate_pipeline_outputs
+        )
+
+        def f(input_tensor):
+            if input_tensor is not None:
+                assert isinstance(input_tensor, (torch.Tensor, DTensor)), input_tensor
+                input_tensor.grad = None
+                DEALLOCATE_OUTPUT_TENSOR().run(input_tensor, deallocate_pipeline_outputs)
+
+        if not isinstance(input_tensor, Sequence):
+            f(input_tensor)
+        else:
+            for t in input_tensor:
+                f(t)
+
+        # launch grad synchronization (custom grad sync)
+        # Note: Asynchronous communication tends to slow down compute.
+        # To reduce idling from mismatched microbatch times, we launch
+        # asynchronous communication at the same time across the
+        # pipeline-parallel group.
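+        # For illustration only (a hypothetical sketch, not part of this API):
+        # a grad_sync_func is expected to take one model chunk and reduce its
+        # gradients across the data-parallel group, roughly:
+        #
+        #     def grad_sync_func(model_chunk):
+        #         for p in model_chunk.parameters():
+        #             if p.grad is not None:
+        #                 torch.distributed.all_reduce(p.grad, group=dp_group)
+        #
+        # It is invoked at most once per chunk, on that chunk's last microbatch.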
+ if grad_sync_func is not None: + if self.grad_sync_microbatch_id >= 0 and self.last_microbatch_for_model_chunk: + grad_sync_func(model[self.grad_sync_chunk_id]) + synchronized_model_chunks.add(self.grad_sync_chunk_id) + return input_tensor_grad + + +@dataclass +class FWD(BaseInstruction): + microbatch_id: int = 0 + model_chunk_id: int = 0 + param_sync_chunk_id: int = 0 + is_vpp_first_stage: bool = False + is_vpp_last_stage: bool = False + forward_only: bool = False + num_model_chunks: int = 1 + num_microbatches: int = 1 + param_sync_microbatch_id: int = 0 + first_microbatch_for_model_chunk: bool = True + optimizer_step_successful: bool = True + overlap_p2p_comm: bool = False + param_sync_overlap: bool = False + debug: str = "" + + @property + def name(self): + return "forward" + + def forward_step( + self, + data_iterator, + input_tensor, + model, + forward_data_store, + is_pp_first_stage: bool, + is_pp_last_stage: bool, + autocast_dtype=torch.float, + enable_autocast=False, + model_chunk_id=0, + ): + """Forward step for passed-in model. + + If first stage, input tensor is obtained from data_iterator, otherwise + passed-in input_tensor is used. + + Returns output tensor.""" + if enable_autocast: + context_manager = torch.autocast("cuda", dtype=autocast_dtype) + else: + context_manager = contextlib.nullcontext() + with context_manager: + + def prepare_data(): + model_chunk_id = builder.user_data["model_chunk_id"] + ground_truth = [] + if builder.user_data["is_pp_first_stage"]: + local_tensors = next(builder.dataloader[model_chunk_id]) + true_input_tensor = None + else: + local_tensors = next(builder.dataloader[model_chunk_id]) + if isinstance(local_tensors, Sequence) and len(local_tensors) > 1: + ground_truth.append(local_tensors[-1]) + elif isinstance(local_tensors, Dict) and "labels" in local_tensors: + ground_truth.append(local_tensors["labels"]) + true_input_tensor = builder.user_data["p2p_tensors"] + if isinstance(true_input_tensor, Sequence) and len(true_input_tensor) == 1: + true_input_tensor = true_input_tensor[0] + + return true_input_tensor, local_tensors, ground_truth + + builder.user_data["model_chunk_id"] = model_chunk_id + builder.user_data["p2p_tensors"] = input_tensor + builder.user_data["is_pp_first_stage"] = is_pp_first_stage + builder.user_data["is_pp_last_stage"] = is_pp_last_stage + builder.user_data["prepare_data_fn"] = prepare_data + p2p_input, local_input, ground_truth = registed_functions["vescale_interleaved_1f1b_pre_forward_data"]() + builder.user_data["ground_truth"] = ground_truth + output_tensor = registed_functions["vescale_interleaved_1f1b_forward"](p2p_input, local_input) + builder.user_data["output_tensor"] = output_tensor + + if is_pp_last_stage: + output_tensor, loss_tensor = registed_functions["vescale_interleaved_1f1b_loss_fn"]() + forward_data_store.append((output_tensor, loss_tensor)) + if builder.loss_fn is None: + return output_tensor + else: + return loss_tensor + + return output_tensor + + @dummy_p2p + def run(self, input_tensors, output_tensors, param_sync_func, kwargs): + # dump arguments for underlying fwd/bwd helpers + data_iterator, model, forward_data_store, dtype, enable_autocast = ( + kwargs["data_iterator"], + kwargs["model"], + kwargs["forward_data_store"], + kwargs["dtype"], + kwargs["enable_autocast"], + ) + + assert param_sync_func is None + # TODO: implment logic for param_sync_func with PipeModule's utils + if param_sync_func is not None: + if self.param_sync_microbatch_id < self.num_microbatches and 
self.first_microbatch_for_model_chunk: + if 1 < self.param_sync_chunk_id < self.num_model_chunks: + param_sync_func(model[self.param_sync_chunk_id].parameters()) + + if self.overlap_p2p_comm and self.param_sync_overlap: + drain_recv_reqs("forward") + + # forward step + if self.is_vpp_first_stage: + if len(input_tensors[self.model_chunk_id]) == len(output_tensors[self.model_chunk_id]): + input_tensors[self.model_chunk_id].append(None) + + input_tensor = input_tensors[self.model_chunk_id][-1] + output_tensor = self.forward_step( + data_iterator=data_iterator, + input_tensor=input_tensor, + model=model, + forward_data_store=forward_data_store, + is_pp_first_stage=self.is_vpp_first_stage, + is_pp_last_stage=self.is_vpp_last_stage, + autocast_dtype=dtype, + enable_autocast=enable_autocast, + model_chunk_id=self.model_chunk_id, + ) + output_tensors[self.model_chunk_id].append(output_tensor) + + # if forward-only, no need to save tensors for a backward pass + if self.forward_only: + input_tensors[self.model_chunk_id].pop() + output_tensors[self.model_chunk_id].pop() + + return output_tensor + + +@dataclass +class BUBBLE(BaseInstruction): + @property + def name(self): + return "bubble" + + def run(self): + return + + +@dataclass +class LAUNCH_SHARED_UNITS_SYNC(BaseInstruction): + num_chunks: int = 1 + + @property + def name(self): + return "launch remain grad sync" + + @dummy_p2p + def run(self, model): + for model_chunk_id in range(self.num_chunks): + # if isinstance(model, PipeModule): + # model.sync_shared_params(share_params=False, model_chunk_id=model_chunk_id) + ... + + +class InterleavedPipeDreramFlush(PipelineSchema): + def __init__( + self, + num_chunks: int, + meshes: Sequence[DeviceMesh], + default_shape: Shape, + default_dtype: torch.dtype = torch.float32, + batches: int = 1, + input_shapes: Optional[List] = None, + input_shapes_unpad: Optional[List] = None, + **kwargs, + ): + assert batches % len(meshes) == 0, "Interleaved 1f1b only support mircobatch size mode device size" + assert batches // len(meshes) > 1, "Interleaved 1f1b only support mircobatch size = Interger * device size" + self.num_chunks = num_chunks + self.total_num_microbatches = num_chunks * batches + self.input_shapes = input_shapes + self.input_shapes_unpad = input_shapes_unpad + self.default_tensor_shape = default_shape + self.default_dtype = default_dtype + super().__init__(len(meshes), meshes, batches) + + @property + def name(self): + return "Interleaved 1f1b" + + def get_variable_tensor_shape(self, microbatch_id: int): + if self.input_shapes is None or len(self.input_shapes) == 0 or microbatch_id >= self.total_num_microbatches: + return self.default_tensor_shape + + microbatch_group_size = self.num_mesh * self.num_chunks + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + microbatch_id_curr_model_chunk = microbatch_group_id * self.num_mesh + microbatch_id_in_group % self.num_mesh + tensor_shape = self.input_shapes[microbatch_id_curr_model_chunk] + + return tensor_shape + + def get_variable_tensor_shape_unpad(self, microbatch_id: int): + if ( + self.input_shapes_unpad is None + or len(self.input_shapes_unpad) == 0 + or microbatch_id >= self.total_num_microbatches + ): + return None + + microbatch_group_size = self.num_mesh * self.num_chunks + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + microbatch_id_curr_model_chunk = microbatch_group_id * self.num_mesh + 
microbatch_id_in_group % self.num_mesh + return self.input_shapes_unpad[microbatch_id_curr_model_chunk] + + def get_model_chunk_id(self, microbatch_id: int, forward: bool): + """Helper method to get the model chunk ID given the iteration number.""" + microbatch_id_in_group = microbatch_id % (self.num_mesh * self.num_chunks) + model_chunk_id = microbatch_id_in_group // self.num_mesh + if not forward: + model_chunk_id = self.num_chunks - model_chunk_id - 1 + return model_chunk_id + + def is_first_microbatch_for_model_chunk_eager(self, microbatch_id: int) -> bool: + """Check if an iteration is the first for a model chunk eagerly""" + if microbatch_id % self.num_mesh != 0: + # Not the first time to run this model chunk + # For pipeline stage 0, chunk 0 is used by mb(0) + # mb(p), mb(2p), ... + return False + # grouping microbatches by pp_size, the groups will run different model chunk iteratively + microbatch_group_id = microbatch_id // self.num_mesh + if microbatch_group_id < self.num_chunks: + return True + return False + + def is_first_microbatch_for_model_chunk(self, microbatch_id: int) -> bool: + """Check if an iteration is the first for a model chunk.""" + # pp(0): mb(3+1) + # pp(1): mb(2+1) + # pp(2): mb(1+1) + # pp(3): mb(0+1) + microbatch_group_size = self.num_mesh * self.num_chunks + num_microbatch_groups = self.total_num_microbatches // microbatch_group_size + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + if microbatch_group_id == 0: + return microbatch_id_in_group % self.num_mesh == 0 + else: + return False + + def is_last_microbatch_for_model_chunk(self, microbatch_id: int) -> bool: + """Check if an iteration is the last for a model chunk.""" + microbatch_group_size = self.num_mesh * self.num_chunks + num_microbatch_groups = self.total_num_microbatches // microbatch_group_size + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + if microbatch_group_id == num_microbatch_groups - 1: + return microbatch_id_in_group % self.num_mesh == self.num_mesh - 1 + else: + return False + + def _gen_schedule(self): + b = self.batches + d = self.num_mesh + s = self.num_chunks + + warmup_batches = [min((d - i - 1) * 2 + (s - 1) * d, b * s) for i in range(d)] + self.warmup_batches = warmup_batches + remaining = [(b * s - w) for w in warmup_batches] + self.remaining = remaining + num_clock = (b * s + d - 1) * 2 # time todo flush + schedules = [[None] * d for c in range(num_clock)] + new_timeline = list(range(d)) + bwd_done_idx = np.zeros(shape=[num_clock, d, s], dtype=np.int32) + next_fwd_batch_idx = np.zeros(shape=[d, s], dtype=np.int32) + next_bwd_batch_idx = np.zeros(shape=[d, s], dtype=np.int32) + # warm-up steps + for i in range(d): + for k in range(warmup_batches[i]): + t_i = new_timeline[i] + chunk_id = self.get_model_chunk_id(k, forward=True) + schedules[t_i][i] = Status(next_fwd_batch_idx[i][chunk_id], i, chunk_id, "F", "WUp", k) + new_timeline[i] += 1 # self add for new timeline + next_fwd_batch_idx[i][chunk_id] += 1 # do next micro batch + + for i in reversed(range(d)): + for k in range(remaining[i]): + t_i = new_timeline[i] + f_k = k + warmup_batches[i] + chunk_id = self.get_model_chunk_id(f_k, forward=True) + schedules[t_i][i] = Status(next_fwd_batch_idx[i][chunk_id], i, chunk_id, "F", "1f1b", k) + next_fwd_batch_idx[i][chunk_id] += 1 # do next micro batch + bwd_k = k + chunk_id = self.get_model_chunk_id(bwd_k, forward=False) + 
bwd_done_idx[t_i][i] = bwd_done_idx[t_i - 1][i] + bwd_done_idx[t_i][i][chunk_id] = next_bwd_batch_idx[i][chunk_id] + t_i += 1 + + # do backward + if i + 1 < d: + while bwd_done_idx[t_i][i + 1][chunk_id] < next_bwd_batch_idx[i][chunk_id]: + assert bwd_done_idx[t_i - 1][i][chunk_id] == next_bwd_batch_idx[i][chunk_id] + bwd_done_idx[t_i][i][chunk_id] = bwd_done_idx[t_i - 1][i][chunk_id] + t_i = t_i + 1 + + if k == remaining[i] - 1: # last iterator + schedules[t_i][i] = Status(next_bwd_batch_idx[i][chunk_id], i, chunk_id, "B", "1f1b-l", k) + else: + schedules[t_i][i] = Status(next_bwd_batch_idx[i][chunk_id], i, chunk_id, "B", "1f1b", k) + + bwd_done_idx[t_i][i] = bwd_done_idx[t_i - 1][i] + bwd_done_idx[t_i][i][chunk_id] = next_bwd_batch_idx[i][chunk_id] + next_bwd_batch_idx[i][chunk_id] += 1 + new_timeline[i] = t_i + 1 + + # run cooldown passes + for i in reversed(range(d)): + for k in range(remaining[i], self.total_num_microbatches): + t_i = new_timeline[i] + bwd_k = k + chunk_id = self.get_model_chunk_id(bwd_k, forward=False) + if i + 1 < d: + while bwd_done_idx[t_i][i + 1][chunk_id] <= next_bwd_batch_idx[i][chunk_id]: + bwd_done_idx[t_i][i] = bwd_done_idx[t_i - 1][i] + bwd_done_idx[t_i][i][chunk_id] = next_bwd_batch_idx[i][chunk_id] + t_i = t_i + 1 + schedules[t_i][i] = Status(next_bwd_batch_idx[i][chunk_id], i, chunk_id, "B", "CD", k) + bwd_done_idx[t_i][i] = bwd_done_idx[t_i - 1][i] + bwd_done_idx[t_i][i] = next_bwd_batch_idx[i] + next_bwd_batch_idx[i][chunk_id] += 1 + new_timeline[i] = t_i + 1 + bwd_done_idx[new_timeline[i] : num_clock, i, :] = b + + return schedules + + +class InterleavedOneFOneBInstructionGenerator(InstructionGenerator): + def __init__( + self, + deps: StageDeps, + meshes: List[DeviceMesh], + batches: int, + default_shape: Optional[Shape] = None, + default_dtype: Optional[torch.dtype] = None, + batch_shape_lists: Optional[List[Dict[int, Shape]]] = None, + batch_dtype_lists: Optional[List[Dict[int, torch.dtype]]] = None, + input_shapes: List[Dict[int, Shape]] = None, + input_shapes_unpad: List[Dict[int, Shape]] = None, + num_chunks: int = 1, + batch_p2p_comm: bool = True, + param_sync_overlap: bool = False, + overlap_p2p_comm: bool = False, + grad_sync_overlap: bool = False, + forward_only: bool = False, + ): + forward_only = True if not torch.is_grad_enabled() else forward_only + super().__init__( + deps=deps, + meshes=meshes, + batches=batches, + default_shape=default_shape, + default_dtype=default_dtype, + batch_shape_lists=batch_shape_lists, + batch_dtype_lists=batch_dtype_lists, + num_chunk=num_chunks, + forward_only=forward_only, + ) + self.batch_p2p_comm = batch_p2p_comm + self.overlap_p2p_comm = overlap_p2p_comm + self.param_sync_overlap = param_sync_overlap + self.grad_sync_overlap = grad_sync_overlap + self.num_stage = len(meshes) + self.num_chunks = num_chunks + self.num_meshes = self.num_stage + self.schema = InterleavedPipeDreramFlush( + num_chunks=self.num_chunks, + meshes=self.meshes, + batches=self.batches, + default_shape=default_shape, + default_dtype=default_dtype, + input_shapes=input_shapes, + input_shapes_unpad=input_shapes_unpad, + ) + self.forward_only = forward_only + + def get_tensor_shape(self, microbatch_id: int, input_id: int = 0): + if ( + self.schema.input_shapes is None + or len(self.schema.input_shapes) == 0 + or microbatch_id >= self.schema.total_num_microbatches + ): + return self.schema.default_tensor_shape + microbatch_group_size = self.num_mesh * self.num_chunks + microbatch_group_id = microbatch_id // microbatch_group_size + 
microbatch_id_in_group = microbatch_id % microbatch_group_size + microbatch_id_curr_model_chunk = microbatch_group_id * self.num_mesh + microbatch_id_in_group % self.num_mesh + tensor_shape = self.schema.input_shapes[microbatch_id_curr_model_chunk] + if isinstance(tensor_shape, Dict): + tensor_shape = tensor_shape[input_id] + return tensor_shape + + def get_variable_tensor_shape_unpad(self, microbatch_id: int, input_id: int = 0): + if ( + self.schema.input_shapes is None + or len(self.schema.input_shapes) == 0 + or microbatch_id >= self.schema.total_num_microbatches + ): + return self.schema.default_tensor_shape + microbatch_group_size = self.num_mesh * self.num_chunks + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + microbatch_id_curr_model_chunk = microbatch_group_id * self.num_mesh + microbatch_id_in_group % self.num_mesh + tensor_shape = self.schema.input_shapes_unpad[microbatch_id_curr_model_chunk] + if isinstance(tensor_shape, Dict): + tensor_shape = tensor_shape[input_id] + return tensor_shape + + def get_tensor_dtype(self, microbatch_id: int, input_id: int = 0): + if ( + self.batch_dtype_lists is None + or len(self.batch_dtype_lists) == 0 + or microbatch_id >= self.schema.total_num_microbatches + ): + return self.default_dtype + microbatch_group_size = self.num_mesh * self.num_chunks + microbatch_group_id = microbatch_id // microbatch_group_size + microbatch_id_in_group = microbatch_id % microbatch_group_size + microbatch_id_curr_model_chunk = microbatch_group_id * self.num_mesh + microbatch_id_in_group % self.num_mesh + tensor_dtype = self.batch_dtype_lists[microbatch_id_curr_model_chunk] + if isinstance(tensor_dtype, Dict): + tensor_dtype = tensor_dtype[input_id] + return tensor_dtype + + def get_shape_or_dtype(self, ff: Callable, comm_packages: List[CommPacket], microbatch_id): + def _get_shape_or_dtype(f: Callable, package: CommPacket): + return f(microbatch_id, package.input_id) + + return list(map(partial(_get_shape_or_dtype, ff), comm_packages)) + + # call by pipe emitter + def gen_instruction(self): + schedules: List = self.schema.schedules + self.instruction_list = [[] for _ in range(self.num_stage)] + first_time_1f1b = [True] * self.num_stage + first_time_cool_down = [True] * self.num_stage + _forward_only = self.forward_only + if not torch.is_grad_enabled(): + self.forward_only = True + + # before warmup + for s in range(self.num_meshes): + recv_comms = self.deps.get_recv_comms(s) + tensor_shapes = self.get_shape_or_dtype(self.get_tensor_shape, recv_comms, 0) + tensor_dtypes = self.get_shape_or_dtype(self.get_tensor_dtype, recv_comms, 0) + self._set_inst( + RECV_FORWARD( + comm_packages=recv_comms, + tensor_shapes=tensor_shapes, + tensor_dtypes=tensor_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + batch_id=0, + is_pp_first_stage=self.deps.is_pipeline_first_stage(s), + debug="before warm-up", + ), + s, + ) + + one_f_one_b_set = [set() for _ in range(self.num_meshes)] + + for clk, stages_schemas in enumerate(schedules): + for s, schema in enumerate(stages_schemas): + is_pp_first_stage = self.deps.is_pipeline_first_stage(s) + is_pp_last_stage = self.deps.is_pipeline_last_stage(s) + send_comms = self.deps.get_send_comms(s) + recv_comms = self.deps.get_recv_comms(s) + if schema: + stg = schema.stg + k = schema.k + send_shapes = self.get_shape_or_dtype(self.get_tensor_shape, send_comms, k) + send_dtypes = self.get_shape_or_dtype(self.get_tensor_dtype, send_comms, k) + send_shapes_unpad = 
self.get_shape_or_dtype(self.get_variable_tensor_shape_unpad, send_comms, k) + recv_shapes = self.get_shape_or_dtype(self.get_tensor_shape, recv_comms, k) + recv_dtypes = self.get_shape_or_dtype(self.get_tensor_dtype, recv_comms, k) + if "WUp" in stg: + if not self.overlap_p2p_comm: + self._set_inst(WAIT_FWD(), s) + elif not self.param_sync_overlap: + self._set_inst(DRAIN_RECV_REQS(drain_type="forward"), s) + # TODO: all warmup batch check + + model_chunk_id = self.schema.get_model_chunk_id(k, forward=True) + is_vpp_first_stage = self.deps.is_vpp_first_stage(s, model_chunk_id) + is_vpp_last_stage = self.deps.is_vpp_last_stage(s, model_chunk_id) + param_sync_microbatch_id = k + self.schema.num_mesh + param_sync_chunk_id = self.schema.get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1 + first_microbatch_for_model_chunk = self.schema.is_first_microbatch_for_model_chunk(k) + self._set_inst( + FWD( + microbatch_id=k, + model_chunk_id=model_chunk_id, + param_sync_chunk_id=param_sync_chunk_id, + is_vpp_first_stage=is_vpp_first_stage, + is_vpp_last_stage=is_vpp_last_stage, + forward_only=self.forward_only, + num_model_chunks=self.num_chunk, + num_microbatches=self.batches * self.num_chunk, + param_sync_microbatch_id=param_sync_microbatch_id, + first_microbatch_for_model_chunk=first_microbatch_for_model_chunk, + overlap_p2p_comm=self.overlap_p2p_comm, + param_sync_overlap=self.param_sync_overlap, + ), + s, + ) + # Determine if tensor should be received from previous stage. + next_forward_model_chunk_id = self.schema.get_model_chunk_id(k + 1, forward=True) + recv_prev = True + if is_pp_first_stage: + if next_forward_model_chunk_id == 0: + recv_prev = False + if k == (self.schema.total_num_microbatches - 1): + recv_prev = False + + if is_vpp_last_stage: + self._set_inst(SET_OUTPUT_TO_NONE(), s) + + if not self.overlap_p2p_comm: + if k == (self.schema.warmup_batches[s] - 1) and not self.forward_only: + self._set_inst(SET_INPUTGRAD_TO_NONE(), s) + recv_next = True + if is_pp_last_stage: + recv_next = False + self._set_inst( + SEND_FORWARD_BACKWARD_RECV_FORWARD_BACKWARD( + recv_prev=recv_prev, + recv_next=recv_next, + recv_comms=recv_comms, + send_comms=send_comms, + recv_shapes=recv_shapes, + recv_dtypes=recv_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + debug="none p2p overlap, last batch warm-up", + ), + s, + ) + + self._set_inst(APPEND_GRADS(chunk=self.num_chunk - 1), s) + else: + self._set_inst( + SEND_FORWARD_RECV_FORWARD( + recv_prev=recv_prev, + send_shapes=send_shapes, + send_tensor_shapes_unpad=send_shapes_unpad, + send_dtypes=send_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + overlap_p2p_comm=self.overlap_p2p_comm, + microbatch_id=k, + send_comms=send_comms, + recv_comms=recv_comms, + debug="none p2p overlap, warm-up", + ), + s, + ) + + self._set_inst(APPEND_INPUTS(chunk=next_forward_model_chunk_id), s) + else: + tensor_shapes = self.get_shape_or_dtype(self.get_tensor_shape, send_comms, k + 1) + tensor_dtypes = self.get_shape_or_dtype(self.get_tensor_dtype, send_comms, k + 1) + + self._set_inst( + SEND_FORWARD_RECV_FORWARD( + recv_prev=recv_prev, + send_shapes=tensor_shapes, + send_tensor_shapes_unpad=send_shapes_unpad, + send_dtypes=tensor_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + overlap_p2p_comm=self.overlap_p2p_comm, + send_comms=send_comms, + recv_comms=recv_comms, + debug="p2p overlap, warm up", + ), + s, + ) + if k == (self.schema.warmup_batches[s] - 1) and not self.forward_only: + self._set_inst(SET_INPUTGRAD_TO_NONE(), s) + recv_next = True + if 
is_pp_last_stage: + recv_next = False + self._set_inst( + SEND_BACKWARD_RECV_BACKWARD( + recv_next=recv_next, + send_shapes=send_shapes, + send_tensor_shapes_unpad=send_shapes_unpad, + send_dtypes=send_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + overlap_p2p_comm=self.overlap_p2p_comm, + send_comms=send_comms, + recv_comms=recv_comms, + debug="warm-up", + ), + s, + ) + self._set_inst(APPEND_GRADS(self.num_chunk - 1), s) + self._set_inst(APPEND_INPUTS(chunk=next_forward_model_chunk_id), s) + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + elif "1f1b" in stg: # 1f1b stage + forward_k = k + self.schema.warmup_batches[s] + if first_time_1f1b[s]: + if self.overlap_p2p_comm: + self._set_inst(DRAIN_SEND_REQS(), s) + first_time_1f1b[s] = False + if k in one_f_one_b_set[s]: + continue + else: + one_f_one_b_set[s].add(k) + if self.overlap_p2p_comm: + if not self.param_sync_overlap: + self._set_inst(DRAIN_RECV_REQS(drain_type="forward"), s) + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + + model_chunk_id = self.schema.get_model_chunk_id(forward_k, forward=True) + is_vpp_first_stage = self.deps.is_vpp_first_stage(s, model_chunk_id) + is_vpp_last_stage = self.deps.is_vpp_last_stage(s, model_chunk_id) + param_sync_microbatch_id = forward_k + self.schema.num_mesh + param_sync_chunk_id = ( + self.schema.get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1 + ) + first_microbatch_for_model_chunk = self.schema.is_first_microbatch_for_model_chunk( + forward_k + ) + self._set_inst( + FWD( + microbatch_id=forward_k, + model_chunk_id=model_chunk_id, + param_sync_chunk_id=param_sync_chunk_id, + param_sync_microbatch_id=param_sync_microbatch_id, + param_sync_overlap=self.param_sync_overlap, + first_microbatch_for_model_chunk=first_microbatch_for_model_chunk, + is_vpp_first_stage=is_vpp_first_stage, + is_vpp_last_stage=is_vpp_last_stage, + forward_only=self.forward_only, + num_model_chunks=self.num_chunk, + num_microbatches=self.batches * self.num_chunk, + debug="1f1b", + ), + s, + ) + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + # Last virtual stage no activation tensor to send + if is_vpp_last_stage: + self._set_inst(SET_OUTPUT_TO_NONE(), s) + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if is_pp_first_stage: + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = self.schema.get_model_chunk_id( + forward_k - (self.schema.num_mesh - 1), forward=True + ) + if next_forward_model_chunk_id == (self.schema.num_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = self.schema.get_model_chunk_id( + forward_k + 1, forward=True + ) + + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. 
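+                            # Worked example (illustrative numbers only): with d = 4 meshes,
+                            # s = 2 chunks, and b = 8 microbatches, stage 0 gets
+                            #   warmup_batches[0] = min((4 - 0 - 1) * 2 + (2 - 1) * 4, 8 * 2) = 10
+                            #   remaining[0]      = 8 * 2 - 10 = 6
+                            # so k runs over 6 steady-state iterations here, and the final one
+                            # skips the receive because one extra forward was already received
+                            # before this loop started.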
+ if k == (self.schema.remaining[s] - 1): + recv_prev = False + + # Send activation tensor to the next stage and receive activation tensor from the + # previous stage + tensor_shape = self.schema.get_variable_tensor_shape(forward_k + 1) + send_tensor_shape_unpad = self.schema.get_variable_tensor_shape_unpad(forward_k) + self._set_inst( + SEND_FORWARD_RECV_FORWARD( + recv_prev=recv_prev, + send_shapes=send_shapes, + send_tensor_shapes_unpad=send_shapes_unpad, + send_dtypes=send_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + overlap_p2p_comm=self.overlap_p2p_comm, + send_comms=send_comms, + recv_comms=recv_comms, + microbatch_id=forward_k, + debug="1f1b", + ), + s, + ) + self._set_inst(DRAIN_RECV_REQS(drain_type="backward"), s) + + # Backward pass. + backward_k = k + grad_sync_microbatch_id = backward_k - s + grad_sync_chunk_id = self.schema.get_model_chunk_id(grad_sync_microbatch_id, forward=False) + last_microbatch_for_model_chunk = self.schema.is_last_microbatch_for_model_chunk( + grad_sync_microbatch_id + ) + backward_model_chunk_id = self.schema.get_model_chunk_id(backward_k, forward=False) + is_vpp_first_stage = self.deps.is_vpp_first_stage(s, backward_model_chunk_id) + is_vpp_last_stage = self.deps.is_vpp_last_stage(s, backward_model_chunk_id) + self._set_inst( + BWD( + is_vpp_last_stage=is_vpp_last_stage, + last_microbatch_for_model_chunk=last_microbatch_for_model_chunk, + grad_sync_chunk_id=grad_sync_chunk_id, + grad_sync_microbatch_id=grad_sync_microbatch_id, + model_chunk_id=backward_model_chunk_id, + microbatch_id=backward_k, + debug="1f1b", + ), + s, + ) + + # First virtual stage no activation gradient tensor to send + if is_vpp_first_stage: + self._set_inst(SET_INPUTGRAD_TO_NONE(), s) + + # Determine if the current virtual stage has an activation gradient tensor to receive + recv_next = True + if is_pp_last_stage: + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). 
+ next_backward_model_chunk_id = self.schema.get_model_chunk_id( + backward_k - (self.schema.num_mesh - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = self.schema.get_model_chunk_id( + backward_k + 1, forward=False + ) + + tensor_shape = self.schema.get_variable_tensor_shape(backward_k + 1) + send_tensor_shape_unpad = self.schema.get_variable_tensor_shape_unpad(backward_k) + self._set_inst( + SEND_BACKWARD_RECV_BACKWARD( + recv_next=recv_next, + send_shapes=send_shapes, + send_tensor_shapes_unpad=send_shapes_unpad, + send_dtypes=send_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + overlap_p2p_comm=self.overlap_p2p_comm, + send_comms=send_comms, + recv_comms=recv_comms, + debug="1f1b", + ), + s, + ) + else: + model_chunk_id = self.schema.get_model_chunk_id(forward_k, forward=True) + is_vpp_first_stage = self.deps.is_vpp_first_stage(s, model_chunk_id) + is_vpp_last_stage = self.deps.is_vpp_last_stage(s, model_chunk_id) + param_sync_microbatch_id = forward_k + self.schema.num_mesh + param_sync_chunk_id = ( + self.schema.get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1 + ) + first_microbatch_for_model_chunk = self.schema.is_first_microbatch_for_model_chunk( + forward_k + ) + self._set_inst( + FWD( + microbatch_id=forward_k, + model_chunk_id=model_chunk_id, + is_vpp_first_stage=is_vpp_first_stage, + is_vpp_last_stage=is_vpp_last_stage, + param_sync_chunk_id=param_sync_chunk_id, + param_sync_microbatch_id=param_sync_microbatch_id, + first_microbatch_for_model_chunk=first_microbatch_for_model_chunk, + forward_only=self.forward_only, + ), + s, + ) + + # Backward pass. + backward_k = k + grad_sync_microbatch_id = backward_k - s + grad_sync_chunk_id = self.schema.get_model_chunk_id(grad_sync_microbatch_id, forward=False) + last_microbatch_for_model_chunk = self.schema.is_last_microbatch_for_model_chunk( + grad_sync_microbatch_id + ) + backward_model_chunk_id = self.schema.get_model_chunk_id(backward_k, forward=False) + is_vpp_first_stage = self.deps.is_vpp_first_stage(s, backward_model_chunk_id) + is_vpp_last_stage = self.deps.is_vpp_last_stage(s, backward_model_chunk_id) + self._set_inst( + BWD( + microbatch_id=backward_k, + model_chunk_id=backward_model_chunk_id, + is_vpp_last_stage=is_vpp_last_stage, + last_microbatch_for_model_chunk=last_microbatch_for_model_chunk, + grad_sync_microbatch_id=grad_sync_microbatch_id, + grad_sync_chunk_id=grad_sync_chunk_id, + debug="1f1b", + ), + s, + ) + + # Send output_tensor and input_tensor_grad, receive input_tensor + # and output_tensor_grad. + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = self.schema.get_model_chunk_id(forward_k, forward=True) + is_vpp_last_stage = self.deps.is_vpp_last_stage(s, forward_model_chunk_id) + if is_vpp_last_stage: + self._set_inst(SET_OUTPUT_TO_NONE(), s) + backward_model_chunk_id = self.schema.get_model_chunk_id(backward_k, forward=False) + is_vpp_first_stage = self.deps.is_vpp_first_stage(s, backward_model_chunk_id) + if is_vpp_first_stage: + self._set_inst(SET_INPUTGRAD_TO_NONE(), s) + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if is_pp_first_stage: + # First stage is ahead of last stage by (pipeline_parallel_size - 1). 
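+                                # For illustration: with 4 meshes, the forward microbatch that
+                                # will arrive next at the first stage entered the pipeline
+                                # (4 - 1) = 3 schedule steps earlier, which is why the chunk id
+                                # is computed from forward_k - (num_meshes - 1) rather than
+                                # forward_k + 1.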
+ next_forward_model_chunk_id = self.schema.get_model_chunk_id( + forward_k - (self.num_meshes - 1), forward=True + ) + if next_forward_model_chunk_id == (self.num_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = self.schema.get_model_chunk_id( + forward_k + 1, forward=True + ) + + recv_next = True + if is_pp_last_stage: + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). + next_backward_model_chunk_id = self.schema.get_model_chunk_id( + backward_k - (self.num_meshes - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = self.schema.get_model_chunk_id( + backward_k + 1, forward=False + ) + + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (self.schema.remaining[s] - 1): + recv_prev = False + + self._set_inst( + SEND_FORWARD_BACKWARD_RECV_FORWARD_BACKWARD( + recv_prev=recv_prev, + recv_next=recv_next, + send_comms=send_comms, + recv_comms=recv_comms, + recv_shapes=recv_shapes, + recv_dtypes=recv_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + debug="1f1b", + ), + s, + ) + + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + + # Put input_tensor and output_tensor_grad in data structures in the + # right location. + if recv_prev: + self._set_inst(APPEND_INPUTS(chunk=next_forward_model_chunk_id), s) + if recv_next: + self._set_inst(APPEND_GRADS(chunk=next_backward_model_chunk_id), s) + + # launch grad_sync_func here to overlap with p2p communication + if self.grad_sync_overlap: + raise NotImplementedError("grad sync is not implement yet") + elif stg == "CD": # cool down stage + if first_time_cool_down[s]: + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + if self.overlap_p2p_comm: + self._set_inst(DRAIN_SEND_REQS(), s) + if not self.forward_only: + if self.overlap_p2p_comm: + self._set_inst(DRAIN_RECV_REQS(drain_type="all", check_bwd_wait=True), s) + first_time_cool_down[s] = False + if self.forward_only: + continue # forward have no backward phase + + grad_sync_microbatch_id = k - s + grad_sync_chunk_id = self.schema.get_model_chunk_id(grad_sync_microbatch_id, forward=False) + last_microbatch_for_model_chunk = self.schema.is_last_microbatch_for_model_chunk( + grad_sync_microbatch_id + ) + model_chunk_id = self.schema.get_model_chunk_id(k, forward=False) + is_vpp_last_stage = self.deps.is_vpp_first_stage(s, model_chunk_id) + self._set_inst( + BWD( + microbatch_id=k, + is_vpp_last_stage=is_vpp_last_stage, + model_chunk_id=model_chunk_id, + last_microbatch_for_model_chunk=last_microbatch_for_model_chunk, + grad_sync_chunk_id=grad_sync_chunk_id, + grad_sync_microbatch_id=grad_sync_microbatch_id, + debug="cooldown", + ), + s, + ) + next_backward_model_chunk_id = self.schema.get_model_chunk_id(k + 1, forward=False) + recv_next = True + if is_pp_last_stage: + if next_backward_model_chunk_id == (self.schema.num_chunks - 1): + recv_next = False + if k == (self.schema.total_num_microbatches - 1): + recv_next = False + + tensor_shape = self.schema.get_variable_tensor_shape(k + 1) + send_tensor_shape_unpad = self.schema.get_variable_tensor_shape_unpad(k) + self._set_inst( + SEND_BACKWARD_RECV_BACKWARD( + recv_next=recv_next, + send_shapes=send_shapes, + send_tensor_shapes_unpad=send_shapes_unpad, + send_dtypes=send_dtypes, + batch_p2p_comm=self.batch_p2p_comm, + overlap_p2p_comm=self.overlap_p2p_comm, + send_comms=send_comms, + recv_comms=recv_comms, + 
debug="cooldown", + ), + s, + ) + self._set_inst(APPEND_GRADS(chunk=next_backward_model_chunk_id), s) + + if self.grad_sync_overlap: + raise NotImplementedError("grad sync is not support yet") + + if self.overlap_p2p_comm: + self._set_inst(DRAIN_RECV_REQS(drain_type="all"), s) + else: # bubble + # do any other + self._set_inst(BUBBLE(), s) + # Launch any remaining grad reductions + # if grad_sync_func is not None: + # for model_chunk_id in range(num_model_chunks): + # if model_chunk_id not in synchronized_model_chunks: + # grad_sync_func(model[model_chunk_id], model_chunk_id) + # synchronized_model_chunks.add(model_chunk_id) + + # add cool down things + for s in range(self.num_meshes): + if not self.forward_only: + # Launch any remaining grad reductions + self._set_inst(LAUNCH_SHARED_UNITS_SYNC(num_chunks=self.deps.get_num_chunks()), s) + + if self.overlap_p2p_comm: + self._set_inst(DRAIN_SEND_REQS(), s) + + # restore original self.forward_only if the current context manager is torch.no_grad() + if not torch.is_grad_enabled(): + self.forward_only = _forward_only + + self.gen_instruction_str_list() + return self.instruction_list + + def gen_instruction_str_list(self): + instruction_lists = self.instruction_list + stage_strs = defaultdict(str) + for stage_id, instruction_list in enumerate(instruction_lists): + cur_stage_str = stage_strs[stage_id] + for inst in instruction_list: + cur_stage_str += f"{VESACLE_INSTRUCTION_MAPPING_V[type(inst)]}," + cur_stage_str = cur_stage_str[:-1] + stage_strs[stage_id] = cur_stage_str + builder.build_from_dict(stage_strs) + + @manage_dump_file + def execute( + self, + stage_id, + autocast_dtype=torch.float32, + enable_autocast=False, + grad_scaler=None, + deallocate_pipeline_outputs=False, + param_sync_func=None, + grad_sync_func=None, + ): + # if the current context manager is torch.no_grad(), do not compute backward + temp_forward_only = self.forward_only + if not torch.is_grad_enabled(): + self.forward_only = False + + # init constant data + builder.constant_data["autocast_dtype"] = autocast_dtype + builder.constant_data["enable_autocast"] = enable_autocast + builder.constant_data["grad_scaler"] = grad_scaler + builder.constant_data["deallocate_pipeline_outputs"] = deallocate_pipeline_outputs + builder.constant_data["param_sync_func"] = param_sync_func + builder.constant_data["grad_sync_func"] = grad_sync_func + + # Model chunk IDs with synchronized grads + builder.user_data["synchronized_model_chunks"] = set() + builder.user_data["input_tensors"] = [[] for _ in range(self.num_chunk)] + builder.user_data["output_tensors"] = [[] for _ in range(self.num_chunk)] + builder.user_data["output_tensor_grads"] = [[] for _ in range(self.num_chunk)] + builder.user_data["fwd_wait_handles"] = None + builder.user_data["bwd_wait_handles"] = None + builder.user_data["output_tensor"] = None + builder.user_data["input_tensor"] = None + builder.user_data["output_tensor_grad"] = None + builder.user_data["forward_data_store"] = [] + model = self.deps.get_current_model(stage_id) + + builder.model = model + instruction_list = self.get_instruction_list(stage_id) + builder.stage_id = stage_id + builder_instruction_list = builder.global_instructions_funcs[stage_id] + + for inst, fn in zip(instruction_list, builder_instruction_list): + builder.user_data["inst"] = inst + fn() + + # restore original self.forward_only if the current context manager is torch.no_grad() + if not torch.is_grad_enabled(): + self.forward_only = temp_forward_only + + return 
builder.user_data["forward_data_store"] + + +@register_instruction(name="vescale_interleavd_1f1b_recv_forward") +def vpp_recv_forward(): + inst = builder.user_data["inst"] + tmp = inst.run() + input_tensors = builder.user_data["input_tensors"] + input_tensors[0].append(tmp) + + +@register_instruction(name="vescale_interleavd_1f1b_forward") +def vpp_forward(): + inst = builder.user_data["inst"] + user_data = builder.user_data + forward_data_store = user_data["forward_data_store"] + input_tensors = user_data["input_tensors"] + output_tensors = user_data["output_tensors"] + + constant_data = builder.constant_data + autocast_dtype = constant_data["autocast_dtype"] + enable_autocast = constant_data["enable_autocast"] + param_sync_func = constant_data["param_sync_func"] + + forward_args = { + "data_iterator": builder.dataloader, + "model": builder.model, + "forward_data_store": forward_data_store, + "dtype": autocast_dtype, + "enable_autocast": enable_autocast, + } + output_tensor = inst.run( + input_tensors=input_tensors, + output_tensors=output_tensors, + param_sync_func=param_sync_func, + kwargs=forward_args, + ) + user_data["output_tensor"] = output_tensor + + +@register_instruction(name="vescale_interleaved_1f1b_backward") +def vpp_backward(): + inst = builder.user_data["inst"] + model = builder.model + grad_scaler = builder.constant_data["grad_scaler"] + deallocate_pipeline_outputs = builder.constant_data["deallocate_pipeline_outputs"] + backward_args = { + "grad_scaler": grad_scaler, + "model": model, + "deallocate_pipeline_outputs": deallocate_pipeline_outputs, + } + grad_sync_func = builder.constant_data["grad_sync_func"] + input_tensors = builder.user_data["input_tensors"] + output_tensors = builder.user_data["output_tensors"] + output_tensor_grads = builder.user_data["output_tensor_grads"] + synchronized_model_chunks = builder.user_data["synchronized_model_chunks"] + + input_tensor_grad = inst.run( + input_tensors=input_tensors, + output_tensors=output_tensors, + output_tensor_grads=output_tensor_grads, + grad_sync_func=grad_sync_func, + synchronized_model_chunks=synchronized_model_chunks, + kwargs=backward_args, + ) + builder.user_data["input_tensor_grad"] = input_tensor_grad + + +@register_instruction(name="vescale_interleavd_1f1b_set_output_to_none") +def vpp_set_output_to_none(): + inst = builder.user_data["inst"] + output_tensor = inst.run() + builder.user_data["output_tensor"] = None + + +@register_instruction(name="vescale_interleavd_1f1b_set_input_grad_to_none") +def vpp_set_input_grad_to_none(): + inst = builder.user_data["inst"] + input_tensor_grad = inst.run() + builder.user_data["input_tensor_grad"] = input_tensor_grad + + +@register_instruction(name="vescale_interleaved_1f1b_send_forward_recv_forward") +def vpp_send_forward_recv_forward(): + inst = builder.user_data["inst"] + output_tensor = builder.user_data["output_tensor"] + input_tensor = inst.run(output_tensor=output_tensor) + if inst.overlap_p2p_comm: + input_tensor, fwd_wait_handles = input_tensor + builder.user_data["fwd_wait_handles"] = fwd_wait_handles + builder.user_data["input_tensor"] = input_tensor + + +@register_instruction(name="vescale_interleavd_1f1b_send_backward_recv_backward") +def vpp_send_backward_recv_backward(): + inst = builder.user_data["inst"] + input_tensor_grad = builder.user_data["input_tensor_grad"] + output_tensor_grad = inst.run(input_tensor_grad=input_tensor_grad) + if inst.overlap_p2p_comm: + output_tensor_grad, bwd_wait_handles = output_tensor_grad + 
builder.user_data["bwd_wait_handles"] = bwd_wait_handles + builder.user_data["output_tensor_grad"] = output_tensor_grad + + +@register_instruction(name="vescale_interleaved_1f1b_send_forward_backward_recv_forward_backward") +def vpp_send_forward_backward_recv_forward_backward(): + inst = builder.user_data["inst"] + output_tensor = builder.user_data["output_tensor"] + input_tensor_grad = builder.user_data["input_tensor_grad"] + input_tensor, output_tensor_grad = inst.run(output_tensor=output_tensor, input_tensor_grad=input_tensor_grad) + builder.user_data["input_tensor"] = input_tensor + builder.user_data["output_tensor_grad"] = output_tensor_grad + + +@register_instruction(name="vescale_interleavd_1f1b_append_grads") +def vpp_append_grads(): + inst = builder.user_data["inst"] + output_tensor_grads = builder.user_data["output_tensor_grads"] + output_tensor_grad = builder.user_data["output_tensor_grad"] + inst.run(output_tensor_grad, output_tensor_grads) + + +@register_instruction(name="vescale_interleavd_1f1b_append_inputs") +def vpp_append_inputs(): + inst = builder.user_data["inst"] + input_tensor = builder.user_data["input_tensor"] + input_tensors = builder.user_data["input_tensors"] + inst.run(input_tensor, input_tensors) + + +@register_instruction(name="vescale_interleavd_1f1b_deallocate_output_tensor") +def vpp_deallocate_tensors(): + inst = builder.user_data["inst"] + deallocate_pipeline_outputs = builder.constant_data["deallocate_pipeline_outputs"] + output_tensor = builder.user_data["output_tensor"] + inst.run(output_tensor=output_tensor, deallocate_pipeline_outputs=deallocate_pipeline_outputs) + + +@register_instruction(name="vescale_interleaved_1f1b_drain_send_reqs") +def vpp_drain_send_reqs(): + inst = builder.user_data["inst"] + inst.run() + + +@register_instruction(name="vescale_interleaved_1f1b_drain_recv_reqs") +def vpp_drain_recv_reqs(): + inst = builder.user_data["inst"] + bwd_wait_handles = builder.user_data["bwd_wait_handles"] + inst.run(bwd_wait_handles=bwd_wait_handles) + + +@register_instruction(name="vescale_interleaved_1f1b_wait_fwd") +def vpp_wait_fwd(): + inst = builder.user_data["inst"] + fwd_wait_handles = builder.user_data["fwd_wait_handles"] + inst.run(fwd_wait_handles=fwd_wait_handles) + + +@register_instruction(name="vescale_interleavd_1f1b_launch_shared_units_sync") +def vpp_launch_shared_units_sync(): + model = builder.model + inst = builder.user_data["inst"] + inst.run(model=model) + + +@register_instruction(name="vescale_interleaved_1f1b_pre_forward_data") +def vpp_prepare_forward_args(): + fn = builder.user_data["prepare_data_fn"] + return fn() + + +@register_instruction(name="vescale_interleaved_1f1b_forward") +def forward_fn(p2p_input, local_input): + model_chunk_id = builder.user_data["model_chunk_id"] + if isinstance(builder.model, Sequence): + + def _feed_input(data): + if isinstance(data, Sequence): + return model(*data) + elif isinstance(data, Dict): + return model(**data) + else: + return model(data) + + model = builder.model[model_chunk_id] + if p2p_input is not None: + return _feed_input(p2p_input) + else: + return _feed_input(local_input) + else: + return builder.model(p2p_input, local_input, model_chunk_id) + + +@register_instruction(name="vescale_interleaved_1f1b_loss_fn") +def loss_fn(): + loss_func = builder.loss_fn + output_tensor = builder.user_data["output_tensor"] + if loss_func is None: + return output_tensor, None + temp_tensor = output_tensor + args_spec = signature(loss_func) + args_len = len(args_spec.parameters.keys()) + if 
args_len == 1:
+        output_tensor = loss_func(output_tensor)
+    else:
+        ground_truth = builder.user_data["ground_truth"]
+        loss_fn_inputs = [output_tensor] + ground_truth
+        assert args_len == len(loss_fn_inputs), "Mismatch of loss function #args and #actual inputs!"
+        output_tensor = loss_func(*loss_fn_inputs)
+    builder.user_data["output_tensor"] = output_tensor
+    return temp_tensor, output_tensor
+
+
+VESACLE_INSTRUCTION_MAPPING_V = {
+    RECV_FORWARD: "vescale_interleavd_1f1b_recv_forward",
+    FWD: "vescale_interleavd_1f1b_forward",
+    BWD: "vescale_interleaved_1f1b_backward",
+    SET_OUTPUT_TO_NONE: "vescale_interleavd_1f1b_set_output_to_none",
+    SET_INPUTGRAD_TO_NONE: "vescale_interleavd_1f1b_set_input_grad_to_none",
+    SEND_FORWARD_RECV_FORWARD: "vescale_interleaved_1f1b_send_forward_recv_forward",
+    SEND_BACKWARD_RECV_BACKWARD: "vescale_interleavd_1f1b_send_backward_recv_backward",
+    SEND_FORWARD_BACKWARD_RECV_FORWARD_BACKWARD: "vescale_interleaved_1f1b_send_forward_backward_recv_forward_backward",
+    APPEND_GRADS: "vescale_interleavd_1f1b_append_grads",
+    APPEND_INPUTS: "vescale_interleavd_1f1b_append_inputs",
+    DEALLOCATE_OUTPUT_TENSOR: "vescale_interleavd_1f1b_deallocate_output_tensor",
+    DRAIN_SEND_REQS: "vescale_interleaved_1f1b_drain_send_reqs",
+    DRAIN_RECV_REQS: "vescale_interleaved_1f1b_drain_recv_reqs",
+    WAIT_FWD: "vescale_interleaved_1f1b_wait_fwd",
+    LAUNCH_SHARED_UNITS_SYNC: "vescale_interleavd_1f1b_launch_shared_units_sync",
+}
diff --git a/vescale/pipe/_schedules/pipedream_flush.py b/vescale/pipe/_schedules/pipedream_flush.py
new file mode 100644
index 0000000..5ec3204
--- /dev/null
+++ b/vescale/pipe/_schedules/pipedream_flush.py
@@ -0,0 +1,1287 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +################################################################################ + +from vescale.pipe._schedules.instruction_base import ( + PipelineSchema, + Status, + InstructionGenerator, + Shape, + BaseInstruction, + StageDeps, + CommPacket, + CompilePPCollectiveKind, + CompilePPCollectiveOperator, + VESCALE_INTRUCTION_BUILDER as builder, + register_instruction, + registed_functions, +) +from functools import partial +from dataclasses import dataclass +from dataclasses import field +from collections import defaultdict +from vescale.dtensor.dtensor import DTensor, make_dtensor +import contextlib +import torch +import torch.distributed as dist +from inspect import signature +from vescale.dtensor.device_mesh import DeviceMesh +from typing import Sequence, Optional, List, Union, Dict, Callable, Tuple +import numpy as np +from vescale.pipe.p2p_communication import ( + recv_backward, + recv_forward, + send_backward, + send_forward, + send_forward_recv_backward, + send_backward_recv_forward, +) +from vescale.ndtimeline import ndtimer, ndtimeit_p2p +from vescale.ndtimeline.predefined import FORWARD_COMPUTE, BACKWARD_COMPUTE, CROSS_MESH_RECV, CROSS_MESH_SEND +from vescale.pipe.pipe_stage import PipeModule +from vescale.dtensor._diff import dummy_p2p, manage_dump_file +from torch.distributed._functional_collectives import send, recv +from vescale.dtensor.placement_types import Placement +from vescale.dtensor._utils import compute_global_tensor_info +from torch.distributed.distributed_c10d import _get_default_group + + +def maybe_tensor(tensor): + if isinstance(tensor, DTensor): + return tensor._local_tensor + elif isinstance(tensor, torch.Tensor): + return tensor + else: + raise RuntimeError(f"Error parsing tensor {tensor}") + + +def cross_mesh_recv(comm, p2p_tensor): + mapping_group = comm.cur_mesh.get_mapping_rank(comm.peer_mesh) + if isinstance(mapping_group, int): # equal size + default_pg = _get_default_group() + with ndtimeit_p2p(CROSS_MESH_RECV, default_pg, mapping_group, is_batched=False): + tensor = torch.empty((3, 3), device=p2p_tensor.device, dtype=torch.int64) + recv(tensor, mapping_group, default_pg) + p_size = sum(tensor[:, 0] >= 0) + tensor = tensor[:p_size] + sharding_type = [Placement.serialize_from_tensor(p) for p in tensor] + sharding = sharding_type + if len(sharding_type) > 0: + global_shape, global_stride = compute_global_tensor_info(p2p_tensor, comm.cur_mesh, sharding) + p2p_tensor = make_dtensor( + p2p_tensor, + comm.cur_mesh, + sharding, + shape=torch.Size(global_shape), + dtype=p2p_tensor.dtype, + requires_grad=p2p_tensor.requires_grad, + stride=tuple(global_stride), + ) + return p2p_tensor + else: + return p2p_tensor + else: + raise NotImplementedError("currently not support change mesh size") + + +def cross_mesh_send(comm, dt): + mapping_group = comm.cur_mesh.get_mapping_rank(comm.peer_mesh) + if isinstance(mapping_group, int): # equal size + default_pg = _get_default_group() + with ndtimeit_p2p(CROSS_MESH_SEND, default_pg, mapping_group, is_batched=False): + if isinstance(dt, DTensor): + send_sharding = torch.stack( + [p.serialize_to_tensor(dt.device) for p in dt._spec.placements] + + [ + torch.full((3,), -1, device=dt.device, dtype=torch.int64) + for _ in range(3 - len(dt._spec.placements)) + ] + ) + send(send_sharding, mapping_group, default_pg) + else: # tensor + send(torch.full((3, 3), -1, device=dt.device, dtype=torch.int64), mapping_group, default_pg) + else: + raise NotImplementedError("currently not support change mesh size") + + +def 
cross_mesh_double(comm, fwd_tensor, p2p_tensor): + if isinstance(fwd_tensor, DTensor): + placements = fwd_tensor._spec.placements + global_shape, global_stride = compute_global_tensor_info(p2p_tensor, comm.cur_mesh, placements) + p2p_tensor = make_dtensor( + p2p_tensor, + comm.cur_mesh, + placements, + shape=torch.Size(global_shape), + dtype=p2p_tensor.dtype, + requires_grad=p2p_tensor.requires_grad, + stride=tuple(global_stride), + ) + return p2p_tensor + + +@dataclass +class RECV_FORWARD(BaseInstruction): + comm_packages: List[CommPacket] = field(default_factory=list) + tensor_shapes: Union[List[Shape], Shape] = field(default_factory=list) + tensor_dtypes: Union[List[torch.dtype], torch.dtype] = field(default_factory=list) + batch_id: Optional[int] = None + debug: str = "" + + @property + def name(self): + return "recv_forward" + + def run(self) -> List: + def f(info): + comm, shape, dtype = info + p2p_tensor = recv_forward( + tensor_shape=shape, + recv_dtype=dtype, + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + ) + p2p_tensor = cross_mesh_recv(comm, p2p_tensor) + return p2p_tensor + + infos = zip(self.comm_packages, self.tensor_shapes, self.tensor_dtypes) + out = list(map(f, infos)) + return out if len(out) > 0 else None + + def compile(self) -> List[CompilePPCollectiveOperator]: + out: List[CompilePPCollectiveOperator] = [] + for comm in self.comm_packages: + cur_mesh, peer_mesh = comm.cur_mesh, comm.peer_mesh + coordinate = (cur_mesh.mesh == dist.get_rank()).nonzero(as_tuple=True) + src = peer_mesh.mesh[coordinate].item() + + out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.RECV, src=src)) + return out + + +@dataclass +class SEND_FORWARD(BaseInstruction): + comm_packages: List[CommPacket] = field(default_factory=list) + tensor_shapes: List[Shape] = field(default_factory=list) + batch_id: int = 0 + + @property + def name(self): + return "send_forward" + + @dummy_p2p + def run(self, output_tensors: List[torch.Tensor]): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + + def f(info): + output_tensor, comm, shape = info + send_forward( + output_tensor=maybe_tensor(output_tensor), + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + tensor_shape=shape, + ) + cross_mesh_send(comm, output_tensor) + + infos = zip(output_tensors, self.comm_packages, self.tensor_shapes) + return list(map(f, infos)) + + def compile(self) -> List[CompilePPCollectiveOperator]: + out: List[CompilePPCollectiveOperator] = [] + for comm in self.comm_packages: + cur_mesh, peer_mesh = comm.cur_mesh, comm.peer_mesh + coordinate = (cur_mesh.mesh == dist.get_rank()).nonzero(as_tuple=True) + dst = peer_mesh.mesh[coordinate].item() + + out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.SEND, dst=dst)) + return out + + +@dataclass +class RECV_BACKWARD(BaseInstruction): + comm_packages: List[CommPacket] = field(default_factory=list) + tensor_shapes: Union[List[Shape], Shape] = field(default_factory=list) + tensor_dtypes: List[torch.dtype] = field(default_factory=list) + + @property + def name(self): + return "recv_backward" + + @dummy_p2p + def run(self): + def f(info): + comm, shape, dtype = info + p2p_tensor = recv_backward( + tensor_shape=shape, + recv_dtype=dtype, + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + ) + p2p_tensor = cross_mesh_recv(comm, p2p_tensor) + return p2p_tensor + + infos = zip(self.comm_packages, self.tensor_shapes, self.tensor_dtypes) + out = list(map(f, infos)) + 
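+        # an empty comm-package list yields None rather than [], so callers can
+        # branch uniformly on "nothing received"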
return out if len(out) > 0 else None + + def compile(self) -> List[CompilePPCollectiveOperator]: + out: List[CompilePPCollectiveOperator] = [] + for comm in self.comm_packages: + cur_mesh, peer_mesh = comm.cur_mesh, comm.peer_mesh + coordinate = (cur_mesh.mesh == dist.get_rank()).nonzero(as_tuple=True) + src = peer_mesh.mesh[coordinate].item() + + out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.RECV, src=src, is_backward=True)) + return out + + +@dataclass +class SEND_BACKWARD(BaseInstruction): + recv_comms: List[CommPacket] = field(default_factory=list) + tensor_shapes: Union[List[Shape], Shape] = field(default_factory=list) + + @property + def name(self): + return "send_backward" + + @dummy_p2p + def run(self, input_tensor_grad): + if not isinstance(input_tensor_grad, list): + input_tensor_grad = [input_tensor_grad] + + def f(info): + grad, comm, shape = info + send_backward( + input_tensor_grad=maybe_tensor(grad), + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + tensor_shape=shape, + ) + cross_mesh_send(comm, grad) + + infos = zip(input_tensor_grad, self.recv_comms, self.tensor_shapes) + return list(map(f, infos)) + + def compile(self) -> List[CompilePPCollectiveOperator]: + out: List[CompilePPCollectiveOperator] = [] + for comm in self.recv_comms: + cur_mesh, peer_mesh = comm.cur_mesh, comm.peer_mesh + coordinate = (cur_mesh.mesh == dist.get_rank()).nonzero(as_tuple=True) + dst = peer_mesh.mesh[coordinate].item() + + out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.SEND, dst=dst, is_backward=True)) + return out + + +@dataclass +class SEND_FORWARD_RECV_BACKWARD(BaseInstruction): + comm_packages: List[CommPacket] = field(default_factory=list) + tensor_shapes: Union[List[Shape], Shape] = field(default_factory=list) + tensor_dtypes: Union[List[torch.dtype], torch.dtype] = field(default_factory=list) + send_batch_id: int = 0 + recv_batch_id: int = 0 + + @property + def name(self): + return "send_forward_recv_backward" + + @dummy_p2p + def run(self, output_tensors): + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + + def f(info): + output_tensor, comm, shape, dtype = info + p2p_tensor = send_forward_recv_backward( + output_tensor=maybe_tensor(output_tensor), + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + tensor_shape=shape, + recv_dtype=dtype, + ) + p2p_tensor = cross_mesh_double(comm, output_tensor, p2p_tensor) + return p2p_tensor + + infos = zip(output_tensors, self.comm_packages, self.tensor_shapes, self.tensor_dtypes) + out = list(map(f, infos)) + return out if len(out) > 0 else None + + def compile(self) -> List[CompilePPCollectiveOperator]: + out: List[CompilePPCollectiveOperator] = [] + for comm in self.comm_packages: + cur_mesh, peer_mesh = comm.cur_mesh, comm.peer_mesh + coordinate = (cur_mesh.mesh == dist.get_rank()).nonzero(as_tuple=True) + peer_rank = peer_mesh.mesh[coordinate].item() + + out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.SEND, dst=peer_rank)) + out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.RECV, src=peer_rank, is_backward=True)) + return out + + +@dataclass +class SEND_BACKWARD_RECV_FORWARD(BaseInstruction): + recv_comms: List[CommPacket] + tensor_shapes: Union[List[Shape], Shape] = field(default_factory=list) + tensor_dtypes: Union[List[torch.dtype], torch.dtype] = field(default_factory=list) + + @property + def name(self): + return "send_backward_recv_forward" + + @dummy_p2p + def run(self, 
input_tensor_grad):
+        if not isinstance(input_tensor_grad, list):
+            input_tensor_grad = [input_tensor_grad]
+
+        def f(info):
+            grad, comm, shape, dtype = info
+            p2p_tensor = send_backward_recv_forward(
+                input_tensor_grad=maybe_tensor(grad),
+                current_device_mesh=comm.cur_mesh,
+                peer_device_mesh=comm.peer_mesh,
+                tensor_shape=shape,
+                recv_dtype=dtype,
+            )
+            p2p_tensor = cross_mesh_double(comm, grad, p2p_tensor)
+            return p2p_tensor
+
+        infos = zip(input_tensor_grad, self.recv_comms, self.tensor_shapes, self.tensor_dtypes)
+
+        out = list(map(f, infos))
+        return out if len(out) > 0 else None
+
+    def compile(self) -> List[CompilePPCollectiveOperator]:
+        out: List[CompilePPCollectiveOperator] = []
+        for comm in self.recv_comms:
+            cur_mesh, peer_mesh = comm.cur_mesh, comm.peer_mesh
+            coordinate = (cur_mesh.mesh == dist.get_rank()).nonzero(as_tuple=True)
+            peer_rank = peer_mesh.mesh[coordinate].item()
+
+            out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.SEND, dst=peer_rank, is_backward=True))
+            out.append(CompilePPCollectiveOperator(kind=CompilePPCollectiveKind.RECV, src=peer_rank))
+        return out
+
+
+@dataclass
+class FORWARD_STEP(BaseInstruction):
+    model: Optional[Union[torch.nn.Module, PipeModule]] = None
+    is_pp_first_stage: bool = False
+    is_pp_last_stage: bool = False
+    local_comm: List[CommPacket] = field(default_factory=list)
+    p2p_comm: List[CommPacket] = field(default_factory=list)
+    p2p_index_mapping: List[Tuple[int, int]] = field(default_factory=list)
+    stage_id: int = 0
+    batch_id: int = 0
+    forward_only: bool = False
+
+    @property
+    def name(self):
+        return "forward_step"
+
+    def construct_input_args(self, p2p_tensors, local_inputs):
+        """
+        Example:
+            stage 0: a, c
+            stage 1: b
+            stage 2: dataloader
+
+            stage 2: forward(c, b, dataloader, a)
+
+            p2p_order:  [(0, 2), (1, 0), (2, 0), (0, 0)]
+            send_order: [(0, 0), (0, 2), (1, 0)]
+
+        We assume that p2p sends follow integer order, and that p2p inputs are
+        always passed as positional args.
+        """
+        if not isinstance(local_inputs, (Sequence, Dict)):
+            local_inputs = [local_inputs]
+        if not isinstance(p2p_tensors, list):
+            p2p_tensors = [p2p_tensors]
+        p2p_index_without_local = list(
+            filter(lambda item: item.peer_stage_idx != self.stage_id, self.p2p_index_mapping)
+        )
+        p2p_send_order = sorted(p2p_index_without_local)
+        local_input_mapping = list(filter(lambda item: item.peer_stage_idx == self.stage_id, self.p2p_index_mapping))
+
+        args = []
+        kwargs = {}
+        ground_truth = []
+        for item in self.p2p_index_mapping:
+            if item.peer_stage_idx == self.stage_id:
+                index = local_input_mapping.index(item)
+                args.append(local_inputs[index])
+            else:
+                index = p2p_send_order.index(item)
+                args.append(p2p_tensors[index])
+        if isinstance(local_inputs, Sequence) and len(local_inputs) > 1:
+            ground_truth.append(local_inputs[-1])
+        elif isinstance(local_inputs, Dict) and "labels" in local_inputs:
+            ground_truth.append(local_inputs["labels"])
+        return args, kwargs, ground_truth
+
+    @dummy_p2p
+    def run(self, input_tensor, kwargs):
+        """Forward step for passed-in model.
+
+        If first stage, input tensor is obtained from data_iterator, otherwise
+        passed-in input_tensor is used.
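+
+        Local tensors drawn from the data iterator and p2p tensors received from
+        peer stages are both prepared here and fed to the registered forward function.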
+ + Returns output tensor.""" + + data_iterator, forward_data_store, autocast_dtype, enable_autocast = ( + kwargs["data_iterator"], + kwargs["forward_data_store"], + kwargs["autocast_dtype"], + kwargs["enable_autocast"], + ) + if enable_autocast: + context_manager = torch.autocast("cuda", dtype=autocast_dtype) + else: + context_manager = contextlib.nullcontext() + with context_manager: + + def prepare_data(): + local_tensors = [] + ground_truth = [] + if data_iterator is not None: + if isinstance(data_iterator, list): + if len(data_iterator) > self.batch_id: + local_tensors = data_iterator[self.batch_id] + else: + local_tensors = next(data_iterator) + if isinstance(local_tensors, Sequence) and len(local_tensors) > 1: + ground_truth.append(local_tensors[-1]) + elif isinstance(local_tensors, Dict) and "labels" in local_tensors: + ground_truth.append(local_tensors["labels"]) + return input_tensor, local_tensors, ground_truth + + builder.user_data["prepare_data_fn"] = prepare_data + builder.user_data["batch_id"] = self.batch_id + builder.user_data["p2p_tensors"] = input_tensor + p2p_tensor, local_tensors, ground_truth = registed_functions["vescale_1f1b_pre_forward_data"]() + builder.user_data["ground_truth"] = ground_truth + output_tensor = registed_functions["vescale_1f1b_forward"](p2p_tensor, local_tensors) + builder.user_data["output_tensor"] = output_tensor + + if self.is_pp_last_stage: + # update status machine + output_tensor, loss_tensor = registed_functions["vescale_1f1b_loss_fn"]() + forward_data_store.append((output_tensor, loss_tensor)) + if builder.loss_fn is None: + return output_tensor + else: + return loss_tensor + + return output_tensor + + +@dataclass +class BACKWARD_STEP(BaseInstruction): + @property + def name(self): + return "backward step" + + @dummy_p2p + def run(self, input_tensor, output_tensor, output_tensor_grad, kwargs): + """Backward step through passed-in output tensor. + + If last stage, output_tensor_grad is None, otherwise gradient of loss + with respect to stage's output tensor. + + Returns gradient of loss with respect to input tensor (None if first + stage).""" + + grad_scaler = kwargs["grad_scaler"] + deallocate_pipeline_outputs = kwargs["deallocate_pipeline_outputs"] + # NOTE: This code currently can handle at most one skip connection. It + # needs to be modified slightly to support arbitrary numbers of skip + # connections. + + # Retain the grad on the input_tensor. + unwrap_input_tensor_grad = False + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + unwrap_input_tensor_grad = True + for x in input_tensor: + if x is not None: + x.retain_grad() + + if not isinstance(output_tensor, list): + output_tensor = [output_tensor] + if not isinstance(output_tensor_grad, list): + output_tensor_grad = [output_tensor_grad] + + # extract loss value from output tensors + if isinstance(output_tensor[0], Sequence): + for j in range(len(output_tensor[0])): + if output_tensor[0][j].ndim == 0 and output_tensor[0][j].numel() == 1: + loss_value = output_tensor[0][j] + break + else: + loss_value = output_tensor[0][-1] + else: + loss_value = output_tensor[0] + + # Backward pass. + if len(output_tensor_grad) == 0 and grad_scaler is not None: + output_tensor = grad_scaler(loss_value) + + if deallocate_pipeline_outputs: + assert 0 + else: + torch.autograd.backward(loss_value, grad_tensors=output_tensor_grad[0]) + + # Collect the grad of the input_tensor. 
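+        # x.grad is populated at this point because retain_grad() was called on
+        # every non-None input tensor before torch.autograd.backward above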
input_tensor_grad = [None]
+        if input_tensor is not None:
+            input_tensor_grad = []
+            for x in input_tensor:
+                if x is None:
+                    input_tensor_grad.append(None)
+                else:
+                    input_tensor_grad.append(x.grad)
+
+        if unwrap_input_tensor_grad:
+            input_tensor_grad = input_tensor_grad[0]
+
+        return input_tensor_grad
+
+
+@dataclass
+class DEALLOCATE_OUTPUT_TENSOR(BaseInstruction):
+    deallocate_out: bool = True
+
+    @property
+    def name(self):
+        return "deallocate output tensor"
+
+    @dummy_p2p
+    def run(self, out, deallocate_pipeline_outputs=False):
+        """Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
+
+        This method should be called right after the output tensor has been
+        sent to the next pipeline stage. At this point, the output tensor is
+        only useful for its '.grad_fn' field, and not its '.data'.
+        """
+        # TODO: support DTensor
+        if (out is None) or (not deallocate_pipeline_outputs):
+            return
+
+        def f(out):
+            assert isinstance(out, (torch.Tensor, DTensor)), f"expected Tensor, found {type(out).__name__}."
+            assert out._base is None, "counter-productive to free a view of another tensor."
+            if isinstance(out, DTensor):
+                out._local_tensor.data = torch.empty(
+                    (1,),
+                    device=out.device,
+                    dtype=out.dtype,
+                )
+            else:
+                out.data = torch.empty(
+                    (1,),
+                    device=out.device,
+                    dtype=out.dtype,
+                )
+
+        if isinstance(out, list):
+            for o in out:
+                f(o)
+        else:
+            f(out)
+
+
+@dataclass
+class APPEND_INPUTS(BaseInstruction):
+    @property
+    def name(self):
+        return "append_inputs"
+
+    @dummy_p2p
+    def run(self, input_tensors, input_tensor):
+        input_tensors.append(input_tensor)
+
+
+@dataclass
+class APPEND_OUTPUTS(BaseInstruction):
+    @property
+    def name(self):
+        return "append_outputs"
+
+    @dummy_p2p
+    def run(self, output_tensors, output_tensor):
+        output_tensors.append(output_tensor)
+
+
+@dataclass
+class POP_INPUT(BaseInstruction):
+    @property
+    def name(self):
+        return "pop input"
+
+    @dummy_p2p
+    def run(self, input_tensors):
+        input_tensor = input_tensors.pop(0)
+        return input_tensor
+
+
+@dataclass
+class POP_OUTPUT(BaseInstruction):
+    @property
+    def name(self):
+        return "pop output"
+
+    @dummy_p2p
+    def run(self, output_tensors):
+        output_tensor = output_tensors.pop(0)
+        return output_tensor
+
+
+class PipeDream(PipelineSchema):
+    """
+    Generates the PipeDream (a.k.a. 1F1B) schedule,
+    which is more memory-efficient than GPipe.
+    """
+
+    @property
+    def name(self):
+        return "1f1b"
+
+    def _gen_schedule(self):
+        """
+        Run forwards, then run backwards; the resulting timeline is shown below.
+        d: device
+        m: batches
+        T: timeline
+        Each cell is (batch_idx, device, F/B):
+
+        T   (m,d)    (m,d)    (m,d)
+        -   ------   ------   -------
+
+        0   (0,0,F)
+        1   (1,0,F)  (0,1,F)
+        2   (2,0,F)  (1,1,F)  (0,2,F)
+        3                     (0,2,B)
+        4            (0,1,B)  (1,2,F)
+        5   (0,0,B)  (2,1,F)  (1,2,B)
+        6   (3,0,F)  (1,1,B)  (2,2,F)
+        ...
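+
+        Each device i first runs min(d - i - 1, m) warm-up forwards, then
+        alternates one-forward/one-backward in steady state, and finally drains
+        the remaining backwards during cool-down.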
+ """ + m = self.batches + d = self.num_mesh + + num_clock = (m + d - 1) * 2 # time todo flush + schedules = [[None] * d for c in range(num_clock)] + warmup_batches = [min(d - i - 1, m) for i in range(d)] + remain_batches = [m - i for i in warmup_batches] + next_fwd_batch_idx = [0 for _ in range(d)] + next_bwd_batch_idx = [0 for _ in range(d)] + + self.warmup_batches = warmup_batches + self.remain_batches = remain_batches + + new_timeline = list(range(d)) + """ + t_i|m + 0 1 2 + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 0 0 1 + 4 0 1 1 + 5 1 1 1 + 1f1b + """ + bwd_done_idx = np.zeros(shape=[num_clock, d], dtype=np.int32) + # warm-up steps + for i in range(d): + for k in range(warmup_batches[i]): + t_i = new_timeline[i] + schedules[t_i][i] = Status(batch_idx=next_fwd_batch_idx[i], stage_id=i, f_b="F", stg="WUp", k=k) + new_timeline[i] += 1 # self add for new timeline + next_fwd_batch_idx[i] += 1 # do next micro batch + + # run 1f1b steps + for i in reversed(range(d)): + for idx in range(remain_batches[i]): + # do forward + t_i = new_timeline[i] + schedules[t_i][i] = Status(batch_idx=next_fwd_batch_idx[i], stage_id=i, f_b="F", stg="1f1b", k=idx) + next_fwd_batch_idx[i] += 1 + bwd_done_idx[t_i][i] = next_bwd_batch_idx[i] + t_i += 1 + + # do backward + if i + 1 < d: + while bwd_done_idx[t_i][i + 1] < next_bwd_batch_idx[i]: + # if the stage 2 is done, the stage i must be equal 0 + assert bwd_done_idx[t_i - 1][i] == next_bwd_batch_idx[i] + bwd_done_idx[t_i][i] = bwd_done_idx[t_i - 1][i] + t_i = t_i + 1 + + if idx == remain_batches[i] - 1: # last iterator + schedules[t_i][i] = Status( + batch_idx=next_bwd_batch_idx[i], stage_id=i, f_b="B", stg="1f1b-l", k=idx + ) + else: + schedules[t_i][i] = Status(batch_idx=next_bwd_batch_idx[i], stage_id=i, f_b="B", stg="1f1b", k=idx) + bwd_done_idx[t_i][i] = next_bwd_batch_idx[i] + next_bwd_batch_idx[i] += 1 + new_timeline[i] = t_i + 1 + + # run cool duwn + for i in reversed(range(d)): + for k in range(warmup_batches[i]): + assert i + 1 < d + t_i = new_timeline[i] + while bwd_done_idx[t_i][i + 1] <= next_bwd_batch_idx[i]: + bwd_done_idx[t_i][i] = next_bwd_batch_idx[i] + t_i = t_i + 1 + schedules[t_i][i] = Status(batch_idx=next_bwd_batch_idx[i], stage_id=i, f_b="B", stg="CD", k=k) + bwd_done_idx[t_i][i] = next_bwd_batch_idx[i] + next_bwd_batch_idx[i] += 1 + new_timeline[i] = t_i + 1 + if i > 0: + bwd_done_idx[new_timeline[i] : num_clock, i] = m + return schedules + + +class OneFOneBInstrcutionGenerator(InstructionGenerator): + def __init__( + self, + deps: StageDeps, + meshes: List[DeviceMesh], + batches: int, + default_shape: Optional[Shape] = None, + default_dtype: Optional[torch.dtype] = None, + batch_shape_lists: Optional[List[Dict[int, Shape]]] = None, + batch_dtype_lists: Optional[List[Dict[int, torch.dtype]]] = None, + forward_only: bool = False, + ): + forward_only = True if not torch.is_grad_enabled() else forward_only + super().__init__( + deps=deps, + meshes=meshes, + batches=batches, + default_shape=default_shape, + default_dtype=default_dtype, + batch_shape_lists=batch_shape_lists, + batch_dtype_lists=batch_dtype_lists, + forward_only=forward_only, + ) + self.num_stage = len(meshes) + self.schema = PipeDream(num_stage=self.num_stage, meshes=meshes, batches=self.batches) + self.forward_only = forward_only + + def get_tensor_shape(self, microbatch_id, input_id): + if self.batch_shape_lists: + if input_id in self.batch_shape_lists[microbatch_id].keys(): + return self.batch_shape_lists[microbatch_id][input_id] + return self.default_shape + + def 
get_tensor_dtype(self, microbatch_id, input_id): + if self.batch_dtype_lists: + if input_id in self.batch_dtype_lists[microbatch_id].keys(): + return self.batch_dtype_lists[microbatch_id][input_id] + return self.default_dtype + + def get_tensor_shapes_and_dtypes(self, comm_packages: List[CommPacket], microbatch_id: int): + def get_shape_or_dtype(f: Callable, package: CommPacket): + return f(microbatch_id, package.input_id) + + shapes = map(partial(get_shape_or_dtype, self.get_tensor_shape), comm_packages) + dtypes = map(partial(get_shape_or_dtype, self.get_tensor_dtype), comm_packages) + return list(shapes), list(dtypes) + + # call by pipe emitter + def gen_instruction(self): + # If the context is torch.no_grad(), only execute forward + _forward_only = self.forward_only + if not torch.is_grad_enabled(): + self.forward_only = True + + schedules = self.schema.schedules + self.instruction_list = [[] for _ in range(self.num_stage)] + stack = defaultdict(list) # for 1f1b + first_time_1f1b = [True] * self.num_stage + for clk, stages_schemas in enumerate(schedules): + for s, schema in enumerate(stages_schemas): + send_comms = self.deps.get_send_comms(s) + recv_comms = self.deps.get_recv_comms(s) + p2p_index_mapping = self.deps.mapping[s] + cur_model = self.deps.get_current_model(s) + local_comm = self.deps.get_local_comms(s) + is_pp_first_stage = self.deps.is_pipeline_first_stage(s) + is_pp_last_stage = self.deps.is_pipeline_last_stage(s) + if isinstance(cur_model, Sequence): + assert self.num_chunk == 1, "1f1b support model chunk is 1." + cur_model = cur_model[0] + # batch size, stage idx, forward backward, + if schema: + b_idx = schema.batch_idx + stg = schema.stg + if "WUp" in stg: # warmup stage + # recv forward + recv_shapes, recv_dtypes = self.get_tensor_shapes_and_dtypes(recv_comms, b_idx) + send_shapes, _ = self.get_tensor_shapes_and_dtypes(send_comms, b_idx) + self._set_inst( + RECV_FORWARD( + comm_packages=recv_comms, + tensor_shapes=recv_shapes, + tensor_dtypes=recv_dtypes, + batch_id=b_idx, + debug="warm-up", + ), + s, + ) + self._set_inst( + FORWARD_STEP( + model=cur_model, + is_pp_first_stage=is_pp_first_stage, + is_pp_last_stage=is_pp_last_stage, + local_comm=local_comm, + p2p_comm=recv_comms, + p2p_index_mapping=p2p_index_mapping, + stage_id=s, + batch_id=b_idx, + forward_only=self.forward_only, + ), + s, + ) + self._set_inst( + SEND_FORWARD( + comm_packages=send_comms, + tensor_shapes=send_shapes, + batch_id=b_idx, + ), + s, + ) + + if not self.forward_only: + self._set_inst(APPEND_INPUTS(), s) + self._set_inst(APPEND_OUTPUTS(), s) + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + elif "1f1b" in stg: # 1f1b stage + cur_st = stack[s] + if len(cur_st) < 2: + cur_st.append(schema) # lazy do + else: + raise RuntimeError("unknown schedule") + + if len(cur_st) == 2: + if first_time_1f1b[s]: + recv_shapes, recv_dtypes = self.get_tensor_shapes_and_dtypes(recv_comms, b_idx) + # before run 1f1b + self._set_inst( + RECV_FORWARD( + comm_packages=recv_comms, + tensor_shapes=recv_shapes, + tensor_dtypes=recv_dtypes, + batch_id=b_idx, + debug="first 1f1b", + ), + s, + ) + first_time_1f1b[s] = False + fwd = cur_st[0] + bwd = cur_st[1] + fw_b_idx = fwd.batch_idx + bw_b_idx = bwd.batch_idx + self._set_inst( + FORWARD_STEP( + model=cur_model, + is_pp_first_stage=is_pp_first_stage, + is_pp_last_stage=is_pp_last_stage, + local_comm=local_comm, + p2p_comm=recv_comms, + p2p_index_mapping=p2p_index_mapping, + stage_id=s, + batch_id=fw_b_idx, + forward_only=self.forward_only, + ), + s, + ) + + if 
self.forward_only: + send_shapes, _ = self.get_tensor_shapes_and_dtypes(send_comms, fw_b_idx) + self._set_inst( + SEND_FORWARD( + comm_packages=send_comms, tensor_shapes=send_shapes, batch_id=fw_b_idx + ), + s, + ) + last_iteration = fwd.k == (self.schema.remain_batches[s] - 1) + if not last_iteration: + recv_shapes, recv_dtypes = self.get_tensor_shapes_and_dtypes(recv_comms, fw_b_idx) + self._set_inst( + RECV_FORWARD( + comm_packages=recv_comms, + tensor_shapes=recv_shapes, + tensor_dtypes=recv_dtypes, + batch_id=fw_b_idx, + debug="last_1f1b", + ), + s, + ) + stack[s].clear() + else: + send_shapes, send_dtypes = self.get_tensor_shapes_and_dtypes(send_comms, bw_b_idx) + self._set_inst( + SEND_FORWARD_RECV_BACKWARD( + comm_packages=send_comms, + tensor_shapes=send_shapes, + tensor_dtypes=send_dtypes, + send_batch_id=fw_b_idx, + recv_batch_id=bw_b_idx, + ), + s, + ) + self._set_inst(APPEND_INPUTS(), s) + self._set_inst(APPEND_OUTPUTS(), s) + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + self._set_inst(POP_INPUT(), s) + self._set_inst(POP_OUTPUT(), s) + self._set_inst(BACKWARD_STEP(), s) + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(deallocate_out=False), s) + + if stg == "1f1b-l": + recv_shapes, recv_dtypes = self.get_tensor_shapes_and_dtypes(recv_comms, bw_b_idx) + self._set_inst(SEND_BACKWARD(recv_comms=recv_comms, tensor_shapes=recv_shapes), s) + else: + recv_shapes, recv_dtypes = self.get_tensor_shapes_and_dtypes(recv_comms, fw_b_idx) + self._set_inst( + SEND_BACKWARD_RECV_FORWARD( + recv_comms=recv_comms, tensor_shapes=recv_shapes, tensor_dtypes=recv_dtypes + ), + s, + ) + stack[s].clear() # save for next + else: # 1f1b do f + continue + elif stg == "CD": # cool down stage + if not self.forward_only: + self._set_inst(POP_INPUT(), s) + self._set_inst(POP_OUTPUT(), s) + # recv backward + + send_shapes, send_dtypes = self.get_tensor_shapes_and_dtypes(send_comms, b_idx) + self._set_inst( + RECV_BACKWARD( + comm_packages=send_comms, tensor_shapes=send_shapes, tensor_dtypes=send_dtypes + ), + s, + ) + # backward step + self._set_inst(BACKWARD_STEP(), s) + # deallocate input, output + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(), s) + self._set_inst(DEALLOCATE_OUTPUT_TENSOR(deallocate_out=False), s) + # send backward + recv_shapes, recv_dtypes = self.get_tensor_shapes_and_dtypes(recv_comms, b_idx) + self._set_inst(SEND_BACKWARD(recv_comms=recv_comms, tensor_shapes=recv_shapes), s) + else: # bubble + # TODO + # do any other + continue + self.gen_instruction_str_list() + + # restore original self.forward_only if the current context manager is torch.no_grad() + if not torch.is_grad_enabled(): + self.forward_only = _forward_only + + return self.instruction_list + + def gen_instruction_str_list(self): + instruction_lists = self.instruction_list + stage_strs = defaultdict(str) + for stage_id, instruction_list in enumerate(instruction_lists): + cur_stage_str = stage_strs[stage_id] + for inst in instruction_list: + cur_stage_str += f"{VESACLE_INSTRUCTION_MAPPING[type(inst)]}," + cur_stage_str = cur_stage_str[:-1] + stage_strs[stage_id] = cur_stage_str + builder.build_from_dict(stage_strs) + + @manage_dump_file + def execute( + self, + stage_id, + autocast_dtype=torch.float, + enable_autocast=False, + grad_scaler=None, + deallocate_pipeline_outputs=False, + ): + builder.constant_data["autocast_dtype"] = autocast_dtype + builder.constant_data["enable_autocast"] = enable_autocast + builder.constant_data["grad_scaler"] = grad_scaler + builder.constant_data["deallocate_pipeline_outputs"] = 
deallocate_pipeline_outputs + + user_data = builder.user_data + user_data["input_tensors"] = [] + user_data["output_tensors"] = [] + user_data["input_tensor"] = None # engine need to maintain the dataflow + user_data["output_tensor"] = None # engine need to maintian the output flow + user_data["output_tensor_grad"] = None + user_data["input_tensor_grad"] = None + user_data["forward_data_store"] = [] + + instruction_list = self.get_instruction_list(stage_id) + builder.stage_id = stage_id + builder_instruction_list = builder.global_instructions_funcs[stage_id] + + _forward_only = self.forward_only + if not torch.is_grad_enabled(): + self.forward_only = True + + for inst, fn in zip(instruction_list, builder_instruction_list): + user_data["inst"] = inst + fn() + + # restore original self.forward_only if the current context manager is torch.no_grad() + if not torch.is_grad_enabled(): + self.forward_only = _forward_only + + return user_data["forward_data_store"] + + +@register_instruction(name="vescale_1f1b_recv_forward") +def vescale_recv_forward(): + user_data = builder.user_data + inst = user_data["inst"] + input_tensor = inst.run() + builder.user_data["input_tensor"] = input_tensor + return input_tensor + + +@register_instruction(name="vescale_1f1b_recv_backward") +def vescale_recv_backward(): + user_data = builder.user_data + inst = user_data["inst"] + output_tensor_grad = inst.run() + builder.user_data["output_tensor_grad"] = output_tensor_grad + return output_tensor_grad + + +@register_instruction(name="vescale_1f1b_send_forward") +def vescale_send_forward(): + user_data = builder.user_data + inst = user_data["inst"] + output_tensor = user_data["output_tensor"] + inst.run(output_tensors=output_tensor) + + +@register_instruction(name="vescale_1f1b_send_backward") +def vescale_send_backward(): + user_data = builder.user_data + inst = user_data["inst"] + input_tensor_grad = user_data["input_tensor_grad"] + inst.run(input_tensor_grad=input_tensor_grad) + + +@register_instruction(name="vescale_1f1b_send_forward_recv_backward") +def vescale_send_forward_recv_backward(): + user_data = builder.user_data + inst = user_data["inst"] + output_tensor = user_data["output_tensor"] + output_tensor_grad = inst.run(output_tensors=output_tensor) + builder.user_data["output_tensor_grad"] = output_tensor_grad + + +@register_instruction(name="vescale_1f1b_send_backward_recv_forward") +def vescale_send_backward_recv_forward(): + user_data = builder.user_data + inst = user_data["inst"] + input_tensor_grad = user_data["input_tensor_grad"] + with torch.no_grad(): + input_tensor = inst.run(input_tensor_grad=input_tensor_grad) + builder.user_data["input_tensor"] = input_tensor + + +@register_instruction(name="vescale_1f1b_forward_step") +@ndtimer(FORWARD_COMPUTE) +def vescale_forward_step(): + user_data = builder.user_data + constant_data = builder.constant_data + inst = user_data["inst"] + input_tensor = user_data["input_tensor"] + forward_data_store = user_data["forward_data_store"] + autocast_dtype = constant_data["autocast_dtype"] + builder.model = inst.model + if not autocast_dtype: + autocast_dtype = torch.float32 + enable_autocast = constant_data["enable_autocast"] + if not enable_autocast: + enable_autocast = False + if forward_data_store is None: + forward_data_store = [] + forward_args = { + "data_iterator": builder.dataloader, + "forward_data_store": forward_data_store, + "autocast_dtype": autocast_dtype, + "enable_autocast": enable_autocast, + } + output_tensor = inst.run(input_tensor=input_tensor, 
kwargs=forward_args) + builder.user_data["output_tensor"] = output_tensor + builder.user_data["forward_data_store"] = forward_data_store + + +@register_instruction(name="vescale_1f1b_loss_fn") +def loss_fn(): + user_data = builder.user_data + output_tensor = user_data["output_tensor"] + loss_func = builder.loss_fn + if loss_func is None or output_tensor is None: + return output_tensor, None + temp_tensor = output_tensor + ground_truth = user_data["ground_truth"] + # signature provides a more uniform way to parse callable arguments, including lambda functions + args_spec = signature(loss_func) + args_len = len(args_spec.parameters.keys()) + if args_len == 1: + output_tensor = loss_func(output_tensor) + else: + ground_truth = builder.user_data["ground_truth"] + loss_fn_inputs = [output_tensor] + ground_truth + output_tensor = loss_func(*loss_fn_inputs) + assert args_len == len(loss_fn_inputs), "Mismatch of loss function #args and #actual inputs!" + return temp_tensor, output_tensor + + +@register_instruction(name="vescale_1f1b_pre_forward_data") +def prepare_data(): + user_data = builder.user_data + return user_data["prepare_data_fn"]() + + +@register_instruction(name="vescale_1f1b_forward") +def forward_fn(p2p_input, local_input): + if isinstance(builder.model, PipeModule): + return builder.model(p2p_input, local_input, chunk_id=0) + else: + + def _feed_input(model, data): + if isinstance(data, Sequence): + return model(*data) + elif isinstance(data, Dict): + return model(**data) + else: + return model(data) + + if p2p_input is not None: + return _feed_input(builder.model, p2p_input) + else: + return _feed_input(builder.model, local_input) + + +@register_instruction(name="vescale_1f1b_backward_step") +@ndtimer(BACKWARD_COMPUTE) +def vescale_backward_step(): + constant_data = builder.constant_data + grad_scaler = constant_data["grad_scaler"] + deallocate_pipeline_outputs = constant_data["deallocate_pipeline_outputs"] + backward_args = { + "grad_scaler": grad_scaler, + "deallocate_pipeline_outputs": deallocate_pipeline_outputs, + } + + user_data = builder.user_data + input_tensor = user_data["input_tensor"] + output_tensor = user_data["output_tensor"] + output_tensor_grad = user_data["output_tensor_grad"] + inst = user_data["inst"] + + input_tensor_grad = inst.run( + input_tensor=input_tensor, + output_tensor=output_tensor, + output_tensor_grad=output_tensor_grad, + kwargs=backward_args, + ) + builder.user_data["input_tensor_grad"] = input_tensor_grad + + +@register_instruction(name="vescale_1f1b_pop_input") +def vescale_1f1b_pop_input(): + user_data = builder.user_data + inst = user_data["inst"] + input_tensors = user_data["input_tensors"] + input_tensor = inst.run(input_tensors=input_tensors) + builder.user_data["input_tensor"] = input_tensor + + +@register_instruction(name="vescale_1f1b_pop_output") +def vescale_1f1b_pop_output(): + user_data = builder.user_data + inst = user_data["inst"] + output_tensors = user_data["output_tensors"] + output_tensor = inst.run(output_tensors=output_tensors) + builder.user_data["output_tensor"] = output_tensor + + +@register_instruction(name="vescale_1f1b_append_inputs") +def vescale_1f1b_append_inputs(): + user_data = builder.user_data + inst = user_data["inst"] + input_tensors = user_data["input_tensors"] + input_tensor = user_data["input_tensor"] + if input_tensors is None: + input_tensors = [] + inst.run(input_tensors=input_tensors, input_tensor=input_tensor) + user_data["input_tensors"] = input_tensors + + 
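+# NOTE: for orientation, a minimal sketch of how the handlers registered in this
+# file are driven; this mirrors OneFOneBInstrcutionGenerator.execute above, and
+# `generator` is just an illustrative name for that instance:
+#
+#     insts = generator.get_instruction_list(stage_id)
+#     fns = builder.global_instructions_funcs[stage_id]
+#     for inst, fn in zip(insts, fns):
+#         builder.user_data["inst"] = inst  # each handler reads its instruction here
+#         fn()                              # e.g. runs vescale_1f1b_forward_step()
+
+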
+@register_instruction(name="vescale_1f1b_append_outputs") +def vescale_1f1b_append_outputs(): + user_data = builder.user_data + inst = user_data["inst"] + output_tensors = user_data["output_tensors"] + output_tensor = user_data["output_tensor"] + if output_tensors is None: + output_tensors = [] + inst.run(output_tensors=output_tensors, output_tensor=output_tensor) + user_data["output_tensors"] = output_tensors + + +@register_instruction(name="vescale_1f1b_deallocate_output_tensor") +def vescale_1f1b_deallocate_output_tensor(): + user_data = builder.user_data + inst = user_data["inst"] + const_data = builder.constant_data + deallocate_pipeline_outputs = const_data["deallocate_pipeline_outputs"] + if inst.deallocate_out: + output_tensor = user_data["output_tensor"] + inst.run(output_tensor, deallocate_pipeline_outputs=deallocate_pipeline_outputs) + else: + input_tensor = user_data["input_tensor"] + if input_tensor and input_tensor[0] is not None: + input_tensor[0].grad = None + inst.run(input_tensor, deallocate_pipeline_outputs=deallocate_pipeline_outputs) + + +VESACLE_INSTRUCTION_MAPPING = { + RECV_FORWARD: "vescale_1f1b_recv_forward", + RECV_BACKWARD: "vescale_1f1b_recv_backward", + SEND_FORWARD: "vescale_1f1b_send_forward", + SEND_BACKWARD: "vescale_1f1b_send_backward", + SEND_FORWARD_RECV_BACKWARD: "vescale_1f1b_send_forward_recv_backward", + SEND_BACKWARD_RECV_FORWARD: "vescale_1f1b_send_backward_recv_forward", + FORWARD_STEP: "vescale_1f1b_forward_step", + BACKWARD_STEP: "vescale_1f1b_backward_step", + POP_INPUT: "vescale_1f1b_pop_input", + POP_OUTPUT: "vescale_1f1b_pop_output", + APPEND_INPUTS: "vescale_1f1b_append_inputs", + APPEND_OUTPUTS: "vescale_1f1b_append_outputs", + DEALLOCATE_OUTPUT_TENSOR: "vescale_1f1b_deallocate_output_tensor", +} diff --git a/vescale/pipe/_schedules/pp_collective_emitter.py b/vescale/pipe/_schedules/pp_collective_emitter.py new file mode 100644 index 0000000..edae5a5 --- /dev/null +++ b/vescale/pipe/_schedules/pp_collective_emitter.py @@ -0,0 +1,289 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +from typing import List, Union, Dict +import logging + +import torch +from torch.export.graph_signature import TensorArgument + +from vescale.pipe.pipe_emmiter import ScheduleEngine, OneFOneBInstrcutionGenerator +from vescale.plan.spec import ScheduleType +from vescale.pipe._schedules.instruction_base import ( + BaseInstruction, + CompilePPCollectiveKind, + CompilePPCollectiveOperator, +) + +logger = logging.getLogger(__name__) + + +def read_fg(fg): + num_inputs = 0 + num_outputs = None + for node in fg.graph.nodes: + if node.op == "placeholder": + num_inputs += 1 + if node.op == "output": + num_outputs = len(node.args[0]) + return num_inputs, num_outputs + + +class PPCollectiveOpEmitter: + def __init__(self, curr_rank: int = None) -> None: + self.num_params_and_buffers = self.num_real_inputs = self.num_real_outputs = None + + self.curr_rank = curr_rank + + self.fwd_send_dsts = [] + self.bwd_send_dsts = [] + self.fwd_recv_srcs = [] + self.bwd_recv_srcs = [] + + def gen_pp_collective_topo_from_schedule_engine(self, pipe_engine: ScheduleEngine): + fwd_recv_srcs, fwd_send_dsts, bwd_send_dsts, bwd_recv_srcs = set(), set(), set(), set() + assert ( + pipe_engine.schedule == ScheduleType.SIMPLE_1F1B + ), "For inserting send/recv operators, we only need the topology information, please consider use this simplier PipeSchedule" + assert isinstance( + pipe_engine.p_emmiter.instruction_generator, OneFOneBInstrcutionGenerator + ), "For inserting send/recv operators, we only need the topology information, please consider use this simplier PipeSchedule" + insts: List[BaseInstruction] = pipe_engine.get_instruction_list(pipe_engine.stage_id) + compiled_insts: List[List[CompilePPCollectiveOperator]] = [ + inst.compile() for inst in insts if hasattr(inst, "compile") + ] + flat_compile_insts = [] + for list_insts in compiled_insts: + flat_compile_insts.extend(list_insts) + for inst in flat_compile_insts: + if inst.kind is CompilePPCollectiveKind.BORADCAST: + raise NotImplementedError("broadcast is not supported now") + elif inst.kind is CompilePPCollectiveKind.SEND: + if inst.is_backward: + bwd_send_dsts.add(inst.dst) + else: + fwd_send_dsts.add(inst.dst) + elif inst.kind is CompilePPCollectiveKind.RECV: + if inst.is_backward: + bwd_recv_srcs.add(inst.src) + else: + fwd_recv_srcs.add(inst.src) + else: + raise NotImplementedError("Unknown collective operators") + self.gen_pp_collective_topo_from_given( + list(fwd_send_dsts), list(fwd_recv_srcs), list(bwd_send_dsts), list(bwd_recv_srcs) + ) + + def gen_pp_collective_topo_from_given( + self, + fwd_send_dsts: List[int] = None, + fwd_recv_srcs: List[int] = None, + bwd_send_dsts: List[int] = None, + bwd_recv_srcs: List[int] = None, + ): + self.fwd_send_dsts = fwd_send_dsts + self.fwd_recv_srcs = fwd_recv_srcs + self.bwd_send_dsts = bwd_send_dsts + self.bwd_recv_srcs = bwd_recv_srcs + + # this function should return a dict to indicate a output_spec change in ExportedProgram + def insert_send_fwd(self, fg: torch.fx.GraphModule) -> Dict[str, str]: + if not self.fwd_send_dsts: + return {} + assert len(self.fwd_send_dsts) == self.num_real_outputs + replaced_outputs = {} + for node in fg.graph.nodes: + if node.op != "output": + continue + with fg.graph.inserting_before(node): + node_args = node.args[0] + for i in range(self.num_real_outputs): + arg = node_args[i] + new_node = fg.graph.create_node( + op="call_function", + target=torch.ops.c10d_functional.send.default, + args=( + arg, + 
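+                            # the remaining positional args below appear to follow the
+                            # c10d_functional send signature: (dst, tag, ranks, group_size)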
self.fwd_send_dsts[i], + f"{self.curr_rank}{self.fwd_send_dsts[i]}", + [self.curr_rank, self.fwd_send_dsts[i]], + 2, + ), + kwargs={}, + name="pp_send_fwd", + ) + new_node.meta["stack_trace"] = "inserted by pp_collective_emitter" + new_node.meta["val"] = arg.meta.get("val", None) + new_node.meta["tensor_meta"] = arg.meta.get("tensor_meta", None) + replaced_outputs[arg.name] = new_node.name + node.replace_input_with(arg, new_node) + fg.recompile() + return replaced_outputs + + def insert_recv_fwd(self, fg: torch.fx.GraphModule): + if not self.fwd_recv_srcs: + return + assert len(self.fwd_recv_srcs) == self.num_real_inputs + seen_placeholders = 0 + for node in fg.graph.nodes: + if node.op != "placeholder": + continue + seen_placeholders += 1 + if seen_placeholders <= self.num_params_and_buffers: + continue + real_input_idx = seen_placeholders - self.num_params_and_buffers - 1 + with fg.graph.inserting_after(node): + src = self.fwd_recv_srcs[real_input_idx] + new_node = fg.graph.create_node( + op="call_function", + target=torch.ops.c10d_functional.recv.default, + args=( + node, + src, + f"{src}{self.curr_rank}", + [src, self.curr_rank], + 2, + ), + kwargs={}, + name="pp_recv_fwd", + ) + new_node.meta["stack_trace"] = "inserted by pp_collective_emitter" + new_node.meta["val"] = node.meta.get("val", None) + new_node.meta["tensor_meta"] = node.meta.get("tensor_meta", None) + for user in list(node.users): + if user == new_node: + continue + user.replace_input_with(node, new_node) + + fg.recompile() + + def insert_send_bwd(self, fg: torch.fx.GraphModule): + if not self.bwd_send_dsts: + return + assert len(self.bwd_send_dsts) == self.num_real_inputs + for node in fg.graph.nodes: + if node.op != "output": + continue + with fg.graph.inserting_before(node): + args = node.args[0] + for i in range(self.num_real_inputs): + dst = self.bwd_send_dsts[i] + arg = args[i + self.num_params_and_buffers] + new_node = fg.graph.create_node( + op="call_function", + target=torch.ops.c10d_functional.send.default, + args=( + arg, + dst, + f"{self.curr_rank}{dst}", + [self.curr_rank, dst], + 2, + ), + kwargs={}, + name="pp_send_bwd", + ) + new_node.meta["stack_trace"] = "inserted by pp_collective_emitter" + new_node.meta["val"] = arg.meta.get("val", None) + new_node.meta["tensor_meta"] = arg.meta.get("tensor_meta", None) + node.replace_input_with(arg, new_node) + fg.recompile() + + def insert_recv_bwd(self, fg: torch.fx.GraphModule): + if not self.bwd_recv_srcs: + return + assert len(self.bwd_recv_srcs) == self.num_real_outputs + seen_placeholders = 0 + for node in fg.graph.nodes: + if node.op != "placeholder": + continue + seen_placeholders += 1 + if seen_placeholders <= self.num_params_and_buffers: + continue + with fg.graph.inserting_after(node): + src = self.bwd_recv_srcs[seen_placeholders - self.num_params_and_buffers - 1] + new_node = fg.graph.create_node( + op="call_function", + target=torch.ops.c10d_functional.recv.default, + args=( + node, + src, + f"{src}{self.curr_rank}", + [src, self.curr_rank], + 2, + ), + kwargs={}, + name="pp_recv_bwd", + ) + new_node.meta["stack_trace"] = "inserted by pp_collective_emitter" + new_node.meta["val"] = node.meta.get("val", None) + new_node.meta["tensor_meta"] = node.meta.get("tensor_meta", None) + for user in list(node.users): + if user == new_node: + continue + user.replace_input_with(node, new_node) + + fg.recompile() + + def load_original_graph_module(self, original_gm): + named_parameters = dict(original_gm.named_parameters(remove_duplicate=False)) + named_buffers = 
dict(original_gm.named_buffers(remove_duplicate=False))
+        self.num_params_and_buffers = len(named_buffers) + len(named_parameters)
+        self.num_real_inputs, self.num_real_outputs = read_fg(original_gm)
+
+    def run(self, fg: Union[torch.fx.GraphModule, torch.export.ExportedProgram] = None, is_backward: bool = None):
+        if isinstance(fg, torch.fx.GraphModule):
+            logger.info(
+                "You are inserting PP collective operators into a graph produced by torch.compile; make sure to call PPCollectiveOpEmitter.load_original_graph_module first"
+            )
+            assert (
+                self.num_real_outputs is not None
+                and self.num_params_and_buffers is not None
+                and self.num_real_inputs is not None
+            ), "Please call PPCollectiveOpEmitter.load_original_graph_module first"
+
+            assert is_backward is not None, "Please provide the is_backward argument"
+            if not is_backward:
+                num_total_inputs, _ = read_fg(fg)
+            else:
+                _, num_total_inputs = read_fg(fg)
+            assert num_total_inputs == self.num_real_inputs + self.num_params_and_buffers
+            if not is_backward:
+                self.insert_send_fwd(fg)
+                self.insert_recv_fwd(fg)
+            else:
+                self.insert_send_bwd(fg)
+                self.insert_recv_bwd(fg)
+            return fg
+
+        elif isinstance(fg, torch.export.ExportedProgram):
+            logger.info("You are inserting PP collective operators into a graph produced by torch.export")
+            ep = fg
+            self.num_params_and_buffers = len(ep.state_dict)
+            fg = ep.graph_module
+            self.num_real_inputs, self.num_real_outputs = read_fg(fg)
+            self.num_real_inputs -= self.num_params_and_buffers
+            replaced_outputs = self.insert_send_fwd(fg)
+            self.insert_recv_fwd(fg)
+
+            # output_spec changes
+            for o_spec in ep._graph_signature.output_specs:
+                if isinstance(o_spec.arg, TensorArgument) and o_spec.arg.name in replaced_outputs:
+                    o_spec.arg = TensorArgument(replaced_outputs[o_spec.arg.name])
+            return ep
+
+        else:
+            raise NotImplementedError("Unknown model type")
diff --git a/vescale/pipe/_schedules/zero_bubble_v.py b/vescale/pipe/_schedules/zero_bubble_v.py
new file mode 100644
index 0000000..294a806
--- /dev/null
+++ b/vescale/pipe/_schedules/zero_bubble_v.py
@@ -0,0 +1,1170 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +################################################################################ + +from typing import List, Sequence, Optional, Dict +from collections import deque, defaultdict +from dataclasses import dataclass +from inspect import signature +import contextlib + +import torch + +from vescale.pipe._schedules.instruction_base import ( + InstructionGenerator, + StageDeps, + CommPacket, + register_instruction, + Shape, + registed_functions, + VESCALE_INTRUCTION_BUILDER as builder, +) +from vescale.pipe.p2p_communication import ( + recv_backward, + recv_forward, + send_backward, + send_forward, +) +from vescale.dtensor._diff import manage_dump_file +from vescale.dtensor.device_mesh import DeviceMesh +from vescale.dtensor.dtensor import DTensor, make_dtensor +from vescale.ndtimeline import ndtimeit_p2p +from vescale.ndtimeline.predefined import CROSS_MESH_RECV, CROSS_MESH_SEND +from torch.distributed._functional_collectives import send, recv +from vescale.dtensor.placement_types import Placement +from vescale.dtensor._utils import compute_global_tensor_info +from torch.distributed.distributed_c10d import _get_default_group +from vescale.model.base_gpt.utils import switch_dtensor + +import logging + +logger = logging.getLogger(__file__) + + +def maybe_tensor(tensor): + if isinstance(tensor, DTensor): + return tensor._local_tensor + elif isinstance(tensor, torch.Tensor): + return tensor + else: + raise RuntimeError(f"Error parsing tensor {tensor}") + + +def cross_mesh_recv(comm, p2p_tensor): + mapping_group = comm.cur_mesh.get_mapping_rank(comm.peer_mesh) + if isinstance(mapping_group, int): # equal size + default_pg = _get_default_group() + with ndtimeit_p2p(CROSS_MESH_RECV, default_pg, mapping_group, is_batched=False): + tensor = torch.empty((3, 3), device=p2p_tensor.device, dtype=torch.int64) + recv(tensor, mapping_group, default_pg) + p_size = sum(tensor[:, 0] >= 0) + tensor = tensor[:p_size] + sharding_type = [Placement.serialize_from_tensor(p) for p in tensor] + sharding = sharding_type + if len(sharding_type) > 0: + global_shape, global_stride = compute_global_tensor_info(p2p_tensor, comm.cur_mesh, sharding) + p2p_tensor = make_dtensor( + p2p_tensor, + comm.cur_mesh, + sharding, + shape=torch.Size(global_shape), + dtype=p2p_tensor.dtype, + requires_grad=p2p_tensor.requires_grad, + stride=tuple(global_stride), + ) + return p2p_tensor + else: + return p2p_tensor + else: + raise NotImplementedError("currently not support change mesh size") + + +def cross_mesh_send(comm, dt): + mapping_group = comm.cur_mesh.get_mapping_rank(comm.peer_mesh) + if isinstance(mapping_group, int): # equal size + default_pg = _get_default_group() + with ndtimeit_p2p(CROSS_MESH_SEND, default_pg, mapping_group, is_batched=False): + if isinstance(dt, DTensor): + send_sharding = torch.stack( + [p.serialize_to_tensor(dt.device) for p in dt._spec.placements] + + [ + torch.full((3,), -1, device=dt.device, dtype=torch.int64) + for _ in range(3 - len(dt._spec.placements)) + ] + ) + send(send_sharding, mapping_group, default_pg) + else: # tensor + send(torch.full((3, 3), -1, device=dt.device, dtype=torch.int64), mapping_group, default_pg) + else: + raise NotImplementedError("currently not support change mesh size") + + +def cross_mesh_double(comm, fwd_tensor, p2p_tensor): + if isinstance(fwd_tensor, DTensor): + placements = fwd_tensor._spec.placements + global_shape, global_stride = compute_global_tensor_info(p2p_tensor, comm.cur_mesh, placements) + p2p_tensor = make_dtensor( + p2p_tensor, + comm.cur_mesh, + placements, 
+ shape=torch.Size(global_shape), + dtype=p2p_tensor.dtype, + requires_grad=p2p_tensor.requires_grad, + stride=tuple(global_stride), + ) + return p2p_tensor + + +@dataclass(eq=True, frozen=True) +class ScheduledNode: + type: str + chunk: int + stage: int + minibatch: int + start_time: int + completion_time: int + rollback: bool = False + + def get_send_comms(self, total_stages, deps): + if self.chunk == 0: + return ( + [ + CommPacket( + cur_mesh=deps.get_current_mesh(self.stage), + peer_mesh=deps.get_current_mesh(self.stage + 1), + input_id=0, + peer_stage=self.stage + 1, + ) + ] + if self.stage != total_stages + else [] + ) + else: + return ( + [ + CommPacket( + cur_mesh=deps.get_current_mesh(self.stage), + peer_mesh=deps.get_current_mesh(self.stage - 1), + input_id=0, + peer_stage=self.stage - 1, + ) + ] + if self.stage != 0 + else [] + ) + + def get_recv_comms(self, total_stages, deps): + if self.chunk == 0: + return ( + [ + CommPacket( + cur_mesh=deps.get_current_mesh(self.stage), + peer_mesh=deps.get_current_mesh(self.stage - 1), + input_id=0, + peer_stage=self.stage - 1, + ) + ] + if self.stage != 0 + else [] + ) + else: + return ( + [ + CommPacket( + cur_mesh=deps.get_current_mesh(self.stage), + peer_mesh=deps.get_current_mesh(self.stage + 1), + input_id=0, + peer_stage=self.stage + 1, + ) + ] + if self.stage != total_stages + else [] + ) + + +class CostGraph: + def __init__(self, n_stage, n_micro, f_cost, b_cost, w_cost, c_cost, f_mem, b_mem, w_mem, max_mem=None): + self.n_node = 6 * n_stage * n_micro + self.n_stage = n_stage + self.n_micro = n_micro + self.f_cost = f_cost + self.b_cost = b_cost + self.w_cost = w_cost + self.c_cost = c_cost + self.f_mem = f_mem + self.b_mem = b_mem + self.w_mem = w_mem + self.fbw_cost = [f_cost, b_cost, w_cost] + self.fbw_mem = [f_mem, b_mem, w_mem] + self.max_mem = max_mem or f_mem * self.n_stage * 2 + + def get_id(self, cat, chunk, stage, micro): + return ( + cat * 2 * self.n_stage * self.n_micro + chunk * self.n_stage * self.n_micro + stage * self.n_micro + micro + ) + + def try_v_schedule(self, fill_f=True, fill_b=True, approved_bubble=None): + count = [] + for i in range(self.n_stage): + count.append([0] * 6) + + end_time = [-1] * self.n_node + cur_time = [0] * self.n_stage + mem = [0] * self.n_stage + stage_bubble = [0] * self.n_stage + pending_w = [deque() for _ in range(self.n_stage)] + schedule = [[] for _ in range(self.n_stage)] + stage_str = [" " * i for i in range(self.n_stage)] + + if approved_bubble is None: + approved_bubble = [-1] * self.n_stage + max_approved_bubble = max(approved_bubble) + + def get_max_stage_bubble(stage=-1): + max_stage_bubble = 0 + for bb in stage_bubble: + max_stage_bubble = max(max_stage_bubble, bb) + if stage >= 0: + max_stage_bubble = max(max_stage_bubble, max_approved_bubble - approved_bubble[stage]) + return max_stage_bubble + + def put_w(stage): + assert len(pending_w[stage]) > 0 + _, chunk_, _ = pending_w[stage].popleft() + put(2, chunk_, stage) + + def put(cat, chunk, stage, assert_cnt=True): + _tmp = _no_bubble = cur_time[stage] + self.fbw_cost[cat] + _cnt = count[stage][cat * 2 + chunk] + if _cnt >= self.n_micro: + if not assert_cnt: + stage_str[stage] += " " + cur_time[stage] = _tmp # TODO + return + raise AssertionError() + assert mem[stage] + self.fbw_mem[cat] <= self.max_mem + stage_str[stage] += "FfBbWw"[cat * 2 + chunk] + str(_cnt + 1) + " " * (3 - len(str(_cnt + 1))) + if cat > 0 or chunk > 0: + last_id = cat * 2 + chunk - 1 + if cat < 2: + assert end_time[self.get_id(last_id // 2, last_id % 
2, stage, _cnt)] >= 0 + else: + assert end_time[self.get_id(1, chunk, stage, _cnt)] >= 0 + if chunk == 1 and cat < 2: + if stage < self.n_stage - 1: + _fa_id = self.get_id(cat, chunk, stage + 1, _cnt) + assert end_time[_fa_id] >= 0 + _tmp = max(_tmp, end_time[_fa_id] + self.c_cost + self.fbw_cost[cat]) + if chunk == 0 and cat < 2: + if stage > 0: + _fa_id = self.get_id(cat, chunk, stage - 1, _cnt) + assert end_time[_fa_id] >= 0, f"{cat}, {chunk}, {stage}, {_cnt}" + _tmp = max(_tmp, end_time[_fa_id] + self.c_cost + self.fbw_cost[cat]) + _id = self.get_id(cat, chunk, stage, _cnt) + if count[stage][0] > 0: + stage_bubble[stage] += _tmp - _no_bubble + end_time[_id] = _tmp + cur_time[stage] = _tmp + mem[stage] += self.fbw_mem[cat] + # noinspection PyTypeChecker + schedule[stage].append((cat, chunk, _cnt)) + if cat == 1: + pending_w[stage].append((2, chunk, _cnt)) + count[stage][cat * 2 + chunk] += 1 + + for i in range(self.n_stage): + put(0, 0, i) + for i in range(self.n_stage - 1, -1, -1): + if i == self.n_stage - 1: + put(0, 1, i) + continue + tmp = end_time[self.get_id(0, 1, i + 1, 0)] + self.c_cost + while ( + mem[i] + self.fbw_mem[0] * (2 + i * 2) <= self.max_mem + and cur_time[i] + self.fbw_cost[0] <= tmp + and count[i][0] < self.n_micro + ): + for j in range(i + 1): + put(0, 0, j) + put(0, 1, i) + iter_chunk_ = 0 + end_tmp = 0 + for i in range(self.n_stage): + if i == 0: + end_tmp = cur_time[0] + self.fbw_cost[1] + continue + tmp = end_tmp + self.c_cost + while ( + count[i][0] + count[i][1] < count[i - 1][0] + count[i - 1][1] + or count[i][1] <= count[i - 1][1] < self.n_micro + ): + for j in range(self.n_stage - 1, i - 1, -1): + if count[j][iter_chunk_] < self.n_micro: + put(0, iter_chunk_, j) + iter_chunk_ = 1 - iter_chunk_ + + for _ in range(2 * self.n_micro): + # check mem before putting b + for i in range(self.n_stage): + while mem[i] + self.fbw_mem[1] > self.max_mem: + assert len(pending_w[i]) > 0 + put_w(i) + b0_ranks, b1_ranks = [], [] + for i in range(self.n_stage): + if count[i][3] >= count[i][2]: + b0_ranks.append(i) + elif i == self.n_stage - 1: + b1_ranks.append(i) + else: + fa_id = self.get_id(1, 1, i + 1, count[i][3]) + if end_time[fa_id] >= 0 or count[i][2] >= self.n_micro: + b1_ranks.append(i) + else: + b0_ranks.append(i) + b_ranks = [] + # put b1 + for i in reversed(b1_ranks): + b_ranks.append((i, 1)) + # put b0 + for i in b0_ranks: + b_ranks.append((i, 0)) + for i, _chunk_ in b_ranks: + fa_id = -1 + if _chunk_ == 1 and i < self.n_stage - 1: + fa_id = self.get_id(1, 1, i + 1, count[i][3]) + if _chunk_ == 0 and i > 0: + fa_id = self.get_id(1, 0, i - 1, count[i][2]) + while ( + len(pending_w[i]) > 0 + and fa_id >= 0 + and end_time[fa_id] + self.c_cost >= cur_time[i] + self.fbw_cost[2] + ): + # fill the bubble + put_w(i) + if ( + len(pending_w[i]) > 0 + and end_time[fa_id] + self.c_cost - cur_time[i] > get_max_stage_bubble(i) - stage_bubble[i] + ): + if _chunk_ == 1: + put_w(i) + elif fill_b: + put_w(i) + put(1, _chunk_, i) + + # put f + for i in range(self.n_stage): + if count[i][1] >= self.n_micro: + continue + put_item = None + if count[i][1] >= count[i][0]: + put_item = 0 + elif i == self.n_stage - 1: + put_item = 1 + else: + if end_time[self.get_id(0, 1, i + 1, count[i][1])] >= 0: + put_item = 1 + elif count[i][0] < self.n_micro: + if i == 0: + put_item = 0 + elif end_time[self.get_id(0, 0, i - 1, count[i][0])] >= 0: + put_item = 0 + if put_item is None: + continue + # check mem before putting f + while mem[i] + self.fbw_mem[0] > self.max_mem: + assert 
len(pending_w[i]) > 0 + put_w(i) + fa_id = -1 + if put_item == 0 and i > 0: + fa_id = self.get_id(0, 0, i - 1, count[i][0]) + if put_item == 1 and i < self.n_stage - 1: + fa_id = self.get_id(0, 1, i + 1, count[i][1]) + while ( + len(pending_w[i]) > 0 + and fa_id >= 0 + and end_time[fa_id] + self.c_cost >= cur_time[i] + self.fbw_cost[2] + ): + # fill the bubble + put_w(i) + if ( + len(pending_w[i]) > 0 + and end_time[fa_id] + self.c_cost - cur_time[i] > get_max_stage_bubble(i) - stage_bubble[i] + ): + if fill_f: + put_w(i) + put(0, put_item, i) + + for i in range(self.n_stage): + while len(pending_w[i]) > 0: + put_w(i) + + max_bubble = get_max_stage_bubble() + expected_time = sum(self.fbw_cost) * self.n_micro * 2 + bubble_rate = max_bubble / expected_time + if max_approved_bubble < 0 or max_bubble < max_approved_bubble: + _schedule, _end_time, _max_bubble = self.try_v_schedule( + fill_f=fill_f, + fill_b=fill_b, + approved_bubble=stage_bubble, + ) + if _max_bubble < max_bubble: + return _schedule, _end_time, _max_bubble + return schedule, end_time, max_bubble + + def print_details(self, end_time, print_scaling=1): + for stage in range(self.n_stage): + stage_str = ["."] * int(max(end_time) / print_scaling) + for _cat in range(3): + for _chunk in range(2): + for _micro in range(self.n_micro): + _id = self.get_id(_cat, _chunk, stage, _micro) + if end_time[_id] < 0: + continue + end = int(end_time[_id] / print_scaling) + start = int((end_time[_id] - self.fbw_cost[_cat]) / print_scaling) + for j in range(start, end): + if j == start or j == end - 1: + stage_str[j] = "FfBbWw"[_cat * 2 + _chunk] + elif j == start + 1: + if _micro >= 10: + stage_str[j] = str(_micro // 10) + else: + stage_str[j] = str(_micro) + elif j == start + 2 and _micro >= 10: + stage_str[j] = str(_micro % 10) + else: + stage_str[j] = "-" + _str = "" + for _c in stage_str: + _str += _c + print(_str) + + def get_v_schedule(self, only_run_time=False): + schedule, end_time, max_bubble = None, None, None + expected_time = sum(self.fbw_cost) * self.n_micro * 2 + for fill_b in [True, False]: + for fill_f in [True, False]: + _schedule, _end_time, _max_bubble = self.try_v_schedule(fill_b=fill_b, fill_f=fill_f) + if max_bubble is None or _max_bubble < max_bubble: + max_bubble = _max_bubble + schedule = _schedule + end_time = _end_time + if only_run_time: + return max_bubble + expected_time + bubble_rate = max_bubble / (expected_time + max_bubble) + msg = "%2d %3d, [%5d %5d %5d %5d], %6d -> %6.4f" % ( + self.n_stage, + self.n_micro, + *self.fbw_cost, + self.c_cost, + self.max_mem // self.f_mem, + bubble_rate, + ) + + logger.info(msg) + local_order = [[] for _ in range(self.n_stage)] + comm_id = {} + comm_id_counter = 0 + post_validation_time = 0 + for i in range(self.n_stage - 1, -1, -1): + pv_id = min(2 * (self.n_stage - 1 - i), self.n_micro - 1) + post_validation_time = max( + post_validation_time, end_time[self.get_id(0, 0, i, pv_id)] - self.fbw_cost[0] - self.c_cost + ) + for it in ["RECV_", "SEND_", ""]: + if i == 0 and it == "SEND_": + continue + if i == self.n_stage - 1 and it == "RECV_": + continue + stage_ = i + local_order[stage_].append( + ScheduledNode( + type=it + "POST_VALIDATION", + chunk=0, + stage=stage_, + minibatch=0, + start_time=post_validation_time, + completion_time=post_validation_time, + ) + ) + comm_id[local_order[stage_][-1]] = comm_id_counter + comm_id_counter += 1 + for i in range(self.n_stage): + for _cat_, _chunk_, _micro_ in schedule[i]: + complete_time = end_time[self.get_id(_cat_, _chunk_, i, _micro_)] + 
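+                # Emit the compute node for this entry. Note the chunk id is
+                # mirrored (1 - chunk) for B/W nodes below: the V-shaped schedule
+                # visits the two model chunks in opposite orders in the forward
+                # and backward directions.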
local_order[i].append( + ScheduledNode( + type="FBW"[_cat_], + chunk=_chunk_ if _cat_ == 0 else 1 - _chunk_, + stage=i, + minibatch=_micro_, + start_time=complete_time - self.fbw_cost[_cat_], + completion_time=complete_time, + ) + ) + if _cat_ == 2: # no communication for W + continue + cat_str = "FORWARD" if _cat_ == 0 else "BACKWARD" + + def communicate(send_recv, stage_): + # noinspection PyTypeChecker + local_order[stage_].append( + ScheduledNode( + type=send_recv + cat_str, + chunk=_chunk_ if _cat_ == 0 else 1 - _chunk_, + stage=stage_, + minibatch=_micro_, + start_time=complete_time, + completion_time=complete_time, + ) + ) + comm_id[local_order[stage_][-1]] = comm_id_counter + + if _chunk_ == 1 and i > 0: + communicate("SEND_", i) + communicate("RECV_", i - 1) + if _chunk_ == 0 and i < self.n_stage - 1: + communicate("SEND_", i) + communicate("RECV_", i + 1) + comm_id_counter += 1 + for rank in range(self.n_stage): + # For nodes with the same timestamp on the same stage, communication will be prioritized. + def even_breaker(x: ScheduledNode): + # Compute nodes are always delayed. + if x.type in ["F", "B", "W"]: + return comm_id_counter + # For comm nodes, order by their unique comm id + return comm_id[x] + + local_order[rank] = sorted(local_order[rank], key=lambda x: (x.start_time, even_breaker(x))) + # If a recv with intersects with previous computation, reorder them so that recv + # is executed before computation and hence can be overlapped. + for i in range(len(local_order[rank])): + if ( + i > 0 + and local_order[rank][i - 1].type in {"F", "B", "W"} + and local_order[rank][i].type.startswith("RECV") + and "POST_VALIDATION" not in local_order[rank][i].type + and local_order[rank][i].start_time <= local_order[rank][i - 1].completion_time + ): + local_order[rank][i], local_order[rank][i - 1] = local_order[rank][i - 1], local_order[rank][i] + + local_order_with_rollback = [[] for _ in range(self.n_stage)] + for rank in range(self.n_stage): + rollback_comm = set() + if rank > 0: + for node in local_order[rank - 1]: + if node.type == "POST_VALIDATION": + break + if node.type == "SEND_FORWARD": + assert node.chunk == 0 + rollback_comm.add(node.minibatch) + for node in local_order[rank]: + if node.type == "RECV_FORWARD" and node.chunk == 0 and node.minibatch in rollback_comm: + rollback = True + rollback_comm.remove(node.minibatch) + else: + rollback = False + local_order_with_rollback[rank].append( + ScheduledNode( + type=node.type, + chunk=node.chunk, + stage=node.stage, + minibatch=node.minibatch, + start_time=node.start_time, + completion_time=node.completion_time, + rollback=rollback, + ) + ) + assert len(rollback_comm) == 0 + msg = "" + for node in local_order_with_rollback[rank]: + msg += f"{node.type}-{node.minibatch}-{int(node.rollback)}," + msg = msg[:-1] + "\n" + logger.info(msg) + + return local_order_with_rollback + + +class ZeroBubbleVInstrcutionGenerator(InstructionGenerator): + def __init__( + self, + deps: StageDeps, + meshes: List[DeviceMesh], + batches: int, + f_cost: int, + b_cost: int, + w_cost: int, + c_cost: int, + f_mem: int, + b_mem: int, + w_mem: int, + max_mem=None, + default_shape: Optional[Shape] = None, + default_dtype: Optional[torch.dtype] = None, + ): + self.num_chunk = 2 # for ZBV, manually set num chunks be 2 for each worker + self.deps = deps + n_stage = deps.num_stage + n_micro = batches + self.cost_graph = CostGraph(n_stage, n_micro, f_cost, b_cost, w_cost, c_cost, f_mem, b_mem, w_mem, max_mem=None) + self.num_stage = len(meshes) + self.schema = 
self.cost_graph.get_v_schedule() + self.default_shape = default_shape + self.default_dtype = default_dtype + + def gen_instruction(self): + self.instruction_list = [[] for _ in range(self.num_stage)] + self.instruction_list_str = ["" for _ in range(self.num_stage)] + + for stage in range(self.num_stage): + stage_str = "" + for node in self.schema[stage]: + self._set_inst(node, stage) + stage_str += node.type + "," + stage_str = stage_str[:-1] + + self.gen_instruction_str_list() + + def gen_instruction_str_list(self): + instruction_lists = self.instruction_list + stage_strs = defaultdict(str) + for stage_id, instruction_list in enumerate(instruction_lists): + cur_stage_str = stage_strs[stage_id] + for inst in instruction_list: + cur_stage_str += f"{VESCALE_INSTRUCTION_MAPPING_ZBV[inst.type]}," + cur_stage_str = cur_stage_str[:-1] + stage_strs[stage_id] = cur_stage_str + builder.build_from_dict(stage_strs) + + @manage_dump_file + def execute( + self, + stage_id, + autocast_dtype=torch.float32, + enable_autocast=False, + grad_scaler=None, + deallocate_pipeline_outputs=False, + ): + # init constant data + builder.constant_data["autocast_dtype"] = autocast_dtype + builder.constant_data["enable_autocast"] = enable_autocast + builder.constant_data["grad_scaler"] = grad_scaler + builder.constant_data["deallocate_pipeline_outputs"] = deallocate_pipeline_outputs + builder.constant_data["total_stages"] = self.num_stage + builder.constant_data["stagedeps"] = self.deps + builder.constant_data["default_shape"] = self.default_shape + builder.constant_data["default_dtype"] = self.default_dtype + + # Model chunk IDs with synchronized grads + builder.user_data["synchronized_model_chunks"] = set() + builder.user_data["input_tensors"] = [[] for _ in range(self.num_chunk)] + builder.user_data["output_tensors"] = [[] for _ in range(self.num_chunk)] + builder.user_data["output_tensor_grads"] = [[] for _ in range(self.num_chunk)] + builder.user_data["fwd_wait_handles"] = None + builder.user_data["bwd_wait_handles"] = None + builder.user_data["output_tensor"] = None + builder.user_data["input_tensor"] = (None, None) + builder.user_data["output_tensor_grad"] = None + builder.user_data["forward_data_store"] = [] + model = self.deps.get_current_model(stage_id) + + builder.model = model + instruction_list = self.get_instruction_list(stage_id) + builder.stage_id = stage_id + builder_instruction_list = builder.global_instructions_funcs[stage_id] + + assert len(instruction_list) == len(builder_instruction_list) + # print(f"cur stage {stage_id} debug inst list: {instruction_list} len inst {len(instruction_list)}") + + for inst, fn in zip(instruction_list, builder_instruction_list): + builder.user_data["inst"] = inst + fn() + + return builder.user_data["forward_data_store"] + + +# communication + + +@register_instruction(name="vescale_zbv_send_forward") +def vescale_zbv_send_forward(): + inst = builder.user_data["inst"] + output_tensors = builder.user_data["output_tensor"] + + if not isinstance(output_tensors, list): + output_tensors = [output_tensors] + + def f(info): + output_tensor, comm, shape = info + send_forward( + output_tensor=maybe_tensor(output_tensor), + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + tensor_shape=shape, + ) + cross_mesh_send(comm, output_tensor) + + comm_packages = inst.get_send_comms(builder.constant_data["total_stages"], builder.constant_data["stagedeps"]) + + shapes = [builder.constant_data["default_shape"] for _ in comm_packages] + infos = zip(output_tensors, 
comm_packages, shapes) + return list(map(f, infos)) + + +@register_instruction(name="vescale_zbv_recv_forward") +def vescale_zbv_recv_forward(): + inst = builder.user_data["inst"] + chunk_id = inst.chunk + mbx = inst.minibatch + + def f(info): + comm, shape, dtype = info + p2p_tensor = recv_forward( + tensor_shape=shape, + recv_dtype=dtype, + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + ) + p2p_tensor = cross_mesh_recv(comm, p2p_tensor) + return p2p_tensor + + comm_packages = inst.get_recv_comms(builder.constant_data["total_stages"], builder.constant_data["stagedeps"]) + shapes = [builder.constant_data["default_shape"] for _ in comm_packages] + dtypes = [builder.constant_data["default_dtype"] for _ in comm_packages] + infos = zip(comm_packages, shapes, dtypes) + out = list(map(f, infos)) + input_tensor = out if len(out) > 0 else None + builder.user_data["input_tensor"] = (input_tensor, mbx) + builder.user_data["input_tensors"][chunk_id].append((input_tensor, mbx)) + return input_tensor + + +@register_instruction(name="vescale_zbv_send_backward") +def vescale_zbv_send_backward(): + inst = builder.user_data["inst"] + input_tensor_grad = builder.user_data["input_tensor_grad"] + if not isinstance(input_tensor_grad, list): + input_tensor_grad = [input_tensor_grad] + + def f(info): + grad, comm, shape = info + send_backward( + input_tensor_grad=maybe_tensor(grad), + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + tensor_shape=shape, + ) + cross_mesh_send(comm, grad) + + recv_comms = inst.get_recv_comms(builder.constant_data["total_stages"], builder.constant_data["stagedeps"]) + shapes = [builder.constant_data["default_shape"] for _ in recv_comms] + infos = zip(input_tensor_grad, recv_comms, shapes) + return list(map(f, infos)) + + +@register_instruction(name="vescale_zbv_recv_backward") +def vescale_zbv_recv_backward(): + inst = builder.user_data["inst"] + chunk_id = inst.chunk + + def f(info): + comm, shape, dtype = info + p2p_tensor = recv_backward( + tensor_shape=shape, + recv_dtype=dtype, + current_device_mesh=comm.cur_mesh, + peer_device_mesh=comm.peer_mesh, + ) + p2p_tensor = cross_mesh_recv(comm, p2p_tensor) + return p2p_tensor + + comm_packages = inst.get_send_comms(builder.constant_data["total_stages"], builder.constant_data["stagedeps"]) + shapes = [builder.constant_data["default_shape"] for _ in comm_packages] + dtypes = [builder.constant_data["default_dtype"] for _ in comm_packages] + infos = zip(comm_packages, shapes, dtypes) + out = list(map(f, infos)) + output_tensor_grad = out if len(out) > 0 else None + + builder.user_data["output_tensor_grad"] = output_tensor_grad + builder.user_data["output_tensor_grads"][chunk_id].append(output_tensor_grad) + return output_tensor_grad + + +# forward + + +@register_instruction(name="vescale_zbv_forward") +def vescale_zbv_forward(): + inst = builder.user_data["inst"] + chunk_id = inst.chunk + stage_id = inst.stage + mbx = inst.minibatch + cur_model = builder.model[chunk_id] + + user_data = builder.user_data + forward_data_store = user_data["forward_data_store"] + input_tensors = user_data["input_tensors"] + output_tensors = user_data["output_tensors"] + + constant_data = builder.constant_data + autocast_dtype = constant_data["autocast_dtype"] + enable_autocast = constant_data["enable_autocast"] + + is_pp_first_stage = stage_id == 0 and chunk_id == 0 + is_pp_last_stage = stage_id == 0 and chunk_id == 1 + + # forward step + if is_pp_first_stage: + if len(input_tensors[chunk_id]) == 
len(output_tensors[chunk_id]): + input_tensors[chunk_id].append(None) + + # find corresponding input tensor + input_tensor = None + for cur_item in input_tensors[chunk_id]: + if cur_item is not None and cur_item[1] == mbx: + input_tensor = cur_item[0] + + if not is_pp_first_stage: + assert input_tensor is not None + + if enable_autocast: + context_manager = torch.autocast("cuda", dtype=autocast_dtype) + else: + context_manager = contextlib.nullcontext() + + with context_manager: + + def prepare_data(): + model_chunk_id = builder.user_data["model_chunk_id"] + ground_truth = [] + if builder.user_data["is_pp_first_stage"]: + true_input_tensor = next(builder.dataloader[model_chunk_id]) + # keep the input tensor in builder + if len(input_tensors[chunk_id]) == len(output_tensors[chunk_id]) + 1: + true_input_tensor.requires_grad_() + builder.user_data["input_tensors"][chunk_id].pop() + builder.user_data["input_tensors"][chunk_id].append((true_input_tensor, mbx)) + else: + local_tensors = next(builder.dataloader[model_chunk_id]) + if isinstance(local_tensors, Sequence) and len(local_tensors) > 1: + ground_truth.append(local_tensors[-1]) + elif isinstance(local_tensors, Dict) and "labels" in local_tensors: + ground_truth.append(local_tensors["labels"]) + true_input_tensor = builder.user_data["p2p_tensors"] + if isinstance(true_input_tensor, Sequence): + true_input_tensor = true_input_tensor[0] + + return (true_input_tensor,), {}, ground_truth + + builder.user_data["model_chunk_id"] = chunk_id + builder.user_data["p2p_tensors"] = input_tensor + builder.user_data["is_pp_first_stage"] = is_pp_first_stage + builder.user_data["is_pp_last_stage"] = is_pp_last_stage + builder.user_data["prepare_data_fn"] = prepare_data + args, kwargs, ground_truth = builder.user_data["prepare_data_fn"]() + builder.user_data["ground_truth"] = ground_truth + output_tensor = cur_model(*args, **kwargs) + + if is_pp_last_stage: + output_tensor, loss_tensor = registed_functions["vescale_zbv_loss_fn"](output_tensor) + forward_data_store.append((output_tensor, loss_tensor)) + output_tensor = output_tensor if builder.loss_fn is None else loss_tensor + + if stage_id + 1 == builder.constant_data["total_stages"] and chunk_id == 0: + # turn around the forward direction + builder.user_data["input_tensor"] = (output_tensor, mbx) + builder.user_data["input_tensors"][chunk_id + 1].append((output_tensor, mbx)) + + builder.user_data["output_tensors"][chunk_id].append(output_tensor) + user_data["output_tensor"] = output_tensor + + +# backward + + +@register_instruction(name="vescale_zbv_backward_b") +def vescale_zbv_backward_b(): + inst = builder.user_data["inst"] + chunk_id = inst.chunk + stage_id = inst.stage + grad_scaler = builder.constant_data["grad_scaler"] + deallocate_pipeline_outputs = builder.constant_data["deallocate_pipeline_outputs"] + + input_tensors = builder.user_data["input_tensors"] + output_tensors = builder.user_data["output_tensors"] + output_tensor_grads = builder.user_data["output_tensor_grads"] + + is_pp_last_stage = stage_id == 0 and chunk_id == 1 + + if is_pp_last_stage: + if len(output_tensor_grads[chunk_id]) == 0: + output_tensor_grads[chunk_id].append(None) + input_tensor = input_tensors[chunk_id].pop(0)[0] + output_tensor = output_tensors[chunk_id][0] + output_tensor_grad = output_tensor_grads[chunk_id][0] + + # Retain the grad on the input_tensor. 
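+    # The B step only produces gradients w.r.t. the stage inputs here (parameter
+    # gradients come later from the separate W step), so grads must be kept
+    # alive on these non-leaf input tensors.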
+    unwrap_input_tensor_grad = False
+    if not isinstance(input_tensor, list):
+        input_tensor = [input_tensor]
+        unwrap_input_tensor_grad = True
+    for x in input_tensor:
+        if x is not None:
+            x.retain_grad()
+
+    if not isinstance(output_tensor, list):
+        output_tensor = [output_tensor]
+    if not isinstance(output_tensor_grad, list):
+        output_tensor_grad = [output_tensor_grad]
+
+    # extract loss value from output tensors
+    if isinstance(output_tensor[0], Sequence):
+        for j in range(len(output_tensor[0])):
+            if output_tensor[0][j].ndim == 0 and output_tensor[0][j].numel() == 1:
+                loss_value = output_tensor[0][j]
+                break
+        else:
+            loss_value = output_tensor[0][-1]
+    else:
+        loss_value = output_tensor[0]
+
+    # Backward pass.
+    if output_tensor_grad[0] is None and grad_scaler is not None:
+        loss_value = grad_scaler(loss_value)
+    # FIXME: For virtual pipeline, there may exist frozen layers without grads;
+    # need to verify whether this solution is correct
+    if not loss_value.requires_grad:
+        return None
+
+    if deallocate_pipeline_outputs:
+        assert 0
+        # custom_backward(output_tensor[0], output_tensor_grad[0])
+    else:
+        input_tensor_grad = switch_dtensor(torch.autograd.grad)(
+            loss_value,
+            input_tensor,
+            grad_outputs=output_tensor_grad[0],
+            retain_graph=True,
+            allow_unused=True,
+            materialize_grads=True,
+        )[0]
+
+    if unwrap_input_tensor_grad:
+        input_tensor_grad = input_tensor_grad[0]
+
+    def f(input_tensor):
+        if input_tensor is not None:
+            assert isinstance(input_tensor, (torch.Tensor, DTensor)), input_tensor
+            input_tensor.grad = None
+
+        nonlocal output_tensor
+
+        if not isinstance(output_tensor, Sequence):
+            output_tensor = [output_tensor]
+
+        if (output_tensor is None) or (not deallocate_pipeline_outputs):
+            return
+        # NOTE: isinstance requires a tuple of types, not a list.
+        assert isinstance(
+            output_tensor, (torch.Tensor, DTensor)
+        ), f"expected Tensor, found {type(output_tensor).__name__}."
+        assert output_tensor._base is None, "counter-productive to free a view of another tensor."
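+        # "Free" the output by swapping its storage for a 1-element stub; the
+        # tensor object and its autograd metadata stay alive.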
+        if isinstance(output_tensor, DTensor):
+            # a DTensor frees its local shard; a plain Tensor frees .data directly
+            output_tensor._local_tensor.data = torch.empty(
+                (1,),
+                device=output_tensor.device,
+                dtype=output_tensor.dtype,
+            )
+        else:
+            output_tensor.data = torch.empty(
+                (1,),
+                device=output_tensor.device,
+                dtype=output_tensor.dtype,
+            )
+        return
+
+    # Apply f eagerly: a bare map(...) is lazy and would never execute.
+    if not isinstance(input_tensor, Sequence):
+        f(input_tensor)
+    else:
+        for x in input_tensor:
+            f(x)
+
+    if stage_id + 1 == builder.constant_data["total_stages"] and chunk_id == 1:
+        # turn around the forward direction
+        builder.user_data["output_tensor_grad"] = input_tensor_grad
+        builder.user_data["output_tensor_grads"][chunk_id - 1].append(output_tensor_grad)
+
+    builder.user_data["input_tensor_grad"] = input_tensor_grad
+
+
+@register_instruction(name="vescale_zbv_backward_w")
+def vescale_zbv_backward_w():
+    inst = builder.user_data["inst"]
+    chunk_id = inst.chunk
+    stage_id = inst.stage
+    cur_model = builder.model[chunk_id]
+    grad_scaler = builder.constant_data["grad_scaler"]
+    deallocate_pipeline_outputs = builder.constant_data["deallocate_pipeline_outputs"]
+
+    output_tensors = builder.user_data["output_tensors"]
+    output_tensor_grads = builder.user_data["output_tensor_grads"]
+
+    is_pp_last_stage = stage_id == 0 and chunk_id == 1
+
+    if is_pp_last_stage:
+        if len(output_tensor_grads[chunk_id]) == 0:
+            output_tensor_grads[chunk_id].append(None)
+    output_tensor = output_tensors[chunk_id].pop(0)
+    output_tensor_grad = output_tensor_grads[chunk_id].pop(0)
+
+    if not isinstance(output_tensor, list):
+        output_tensor = [output_tensor]
+    if not isinstance(output_tensor_grad, list):
+        output_tensor_grad = [output_tensor_grad]
+
+    # Backward pass.
+    if output_tensor_grad[0] is None and grad_scaler is not None:
+        output_tensor = grad_scaler(output_tensor[0])
+    # FIXME: For virtual pipeline, there may exist frozen layers without grads;
+    # need to verify whether this solution is correct
+    if not output_tensor[0].requires_grad:
+        return None
+
+    # Gather params
+    nps = {}
+    for key, value in cur_model.named_parameters():
+        nps[key] = value
+
+    if deallocate_pipeline_outputs:
+        assert 0
+    else:
+        params_grad = switch_dtensor(torch.autograd.grad)(
+            output_tensor[0],
+            nps.values(),
+            grad_outputs=output_tensor_grad[0],
+            retain_graph=True,
+            allow_unused=True,
+            materialize_grads=True,
+        )
+
+    # Manually set each param's grad
+    for param, grad in zip(nps.values(), params_grad):
+        param.grad = grad
+
+
+# validation
+
+
+@register_instruction(name="vescale_zbv_post_validation")
+def vescale_zbv_post_validation():
+    pass
+
+
+@register_instruction(name="vescale_zbv_recv_post_validation")
+def vescale_zbv_recv_post_validation():
+    pass
+
+
+@register_instruction(name="vescale_zbv_send_post_validation")
+def vescale_zbv_send_post_validation():
+    pass
+
+
+# loss
+
+
+@register_instruction(name="vescale_zbv_loss_fn")
+def vescale_zbv_loss_fn(output_tensor):
+    loss_func = builder.loss_fn
+    if loss_func is None:
+        return output_tensor, None
+    temp_tensor = output_tensor
+    args_spec = signature(loss_func)
+    args_len = len(args_spec.parameters.keys())
+    if args_len == 1:
+        output_tensor = loss_func(output_tensor)
+    else:
+        ground_truth = builder.user_data["ground_truth"]
+        loss_fn_inputs = [output_tensor] + ground_truth
+        output_tensor = loss_func(*loss_fn_inputs)
+        assert args_len == len(loss_fn_inputs), "Mismatch of loss function #args and #actual inputs!"
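+    # Hand back both the raw model output and the loss: the forward step stores
+    # the pair in forward_data_store and backward starts from the loss tensor.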
+ builder.user_data["output_tensor"] = output_tensor + return temp_tensor, output_tensor + + +VESCALE_INSTRUCTION_MAPPING_ZBV = { + "RECV_FORWARD": "vescale_zbv_recv_forward", + "SEND_FORWARD": "vescale_zbv_send_forward", + "F": "vescale_zbv_forward", + "B": "vescale_zbv_backward_b", + "W": "vescale_zbv_backward_w", + "RECV_BACKWARD": "vescale_zbv_recv_backward", + "SEND_BACKWARD": "vescale_zbv_send_backward", + "RECV_POST_VALIDATION": "vescale_zbv_recv_post_validation", + "SEND_POST_VALIDATION": "vescale_zbv_send_post_validation", + "POST_VALIDATION": "vescale_zbv_post_validation", +} + +if __name__ == "__main__": + settings = [ + # p, n, f, b, w, c, h, a, l + # (8, 24, 18522, 18086, 9337, 601, 2304, 24, 24), + # (8, 32, 18513, 18086, 9331, 626, 2304, 24, 24), + # (8, 64, 18546, 18097, 9321, 762, 2304, 24, 24), + # (8, 24, 29718, 29444, 19927, 527, 4096, 32, 32), + # (8, 32, 29802, 29428, 19530, 577, 4096, 32, 32), + # (8, 64, 29935, 29621, 19388, 535, 4096, 32, 32), + # (16, 48, 11347, 11248, 8132, 377, 5120, 40, 48), + # (16, 64, 11307, 11254, 8101, 379, 5120, 40, 48), + # (16, 128, 11325, 11308, 8109, 378, 5120, 40, 48), + # (32, 96, 10419, 10207, 7715, 408, 6144, 48, 64), + # (32, 128, 10408, 10204, 7703, 408, 6144, 48, 64), + # (32, 256, 10402, 10248, 7698, 460, 6144, 48, 64), + (4, 8, 6, 4, 4, 1, 4096, 32, 32), + # (8, 24, 29444, 29718, 19927, 527, 4096, 32, 32), + # ( 8, 32, 16099, 16504, 7589, 540, 2304, 24, 16), + # (16, 48, 14407, 14380, 9676, 1610, 4096, 32, 32), + # (16, 64, 14412, 14393, 9688, 1621, 4096, 32, 32), + # (16, 128, 14316, 14306, 9639, 1619, 4096, 32, 32), + # (24, 72, 6763, 6969, 5251, 755, 5120, 40, 48), + # (24, 96, 6783, 6984, 5259, 758, 5120, 40, 48), + # (24, 192, 6785, 6990, 5260, 770, 5120, 40, 48), + # (32, 96, 9458, 9748, 7288, 879, 6144, 48, 64), + # (32, 128, 9469, 9744, 7306, 892, 6144, 48, 64), + # (32, 256, 9447, 9644, 7193, 887, 6144, 48, 64), + ] + s = 1024 + + # h, a, s = 4096, 32, 1024 + # cost_f, cost_b, cost_w, cost_c = 29718, 29444, 19927, 527 + for p, n, f, b, w, c, h, a, _ in settings: + mem_f = 34 * h + 5 * a * s + mem_w = -32 * h + mem_b = -mem_w - mem_f + for m_offset in range(p + 1): + graph = CostGraph( + n_stage=p, + n_micro=n, + f_cost=f, + b_cost=b, + w_cost=w, + c_cost=c, + f_mem=mem_f, + b_mem=mem_b, + w_mem=mem_w, + max_mem=mem_f * (p * 2 + m_offset), + ) + graph.get_v_schedule() + break diff --git a/vescale/pipe/p2p_communication.py b/vescale/pipe/p2p_communication.py new file mode 100644 index 0000000..61605b6 --- /dev/null +++ b/vescale/pipe/p2p_communication.py @@ -0,0 +1,1005 @@ +################################################################################ +# +# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ +# Some code are adapted p2p_communication.py in Megatron-LM. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+################################################################################
+
+from enum import Enum
+import os
+import torch
+import torch.distributed as dist
+from vescale.dtensor.dtensor import DTensor
+from vescale.dtensor.device_mesh import DeviceMesh
+from vescale.ndtimeline import ndtimeit_p2p
+from vescale.ndtimeline.predefined import (
+    RECV_FORWARD,
+    RECV_BACKWARD,
+    SEND_FORWARD,
+    SEND_BACKWARD,
+    SEND_FORWARD_RECV_BACKWARD,
+    SEND_BACKWARD_RECV_FORWARD,
+)
+from typing import Optional, List, Union, Tuple
+from torch.distributed.distributed_c10d import ProcessGroup
+
+try:
+    from torch.distributed.distributed_c10d import _coalescing_manager
+except ImportError:
+    # Fall back to the explicit cuda synchronize below when unavailable.
+    _coalescing_manager = None
+    print("Warning: cannot import coalescing_manager. It may impact PP performance")
+
+# Types
+Shape = Union[List[int], torch.Size]
+# For P2P overlap, we currently do not distinguish fwd from bwd reqs;
+# hence, the drain func will sync both fwd and bwd p2p ops.
+GLOBAL_COUNTER = 0
+INTERMEDIATE_SHAPES = []
+MINIBATCH_STEPS = 0
+
+
+def reset_global_counter():
+    global GLOBAL_COUNTER
+    global MINIBATCH_STEPS
+    GLOBAL_COUNTER = 0
+    MINIBATCH_STEPS += 1
+
+
+class OpType(Enum):
+    SEND = 0
+    RECV_FWD = 1
+    RECV_BWD = 2
+
+
+p2p_overlap = False
+send_reqs = []
+recv_fwd_reqs = []
+recv_bwd_reqs = []
+
+
+# Sync P2P-send OP
+def drain_send_reqs():
+    global send_reqs
+    if len(send_reqs) == 0:
+        return
+    for req in send_reqs:
+        req.wait()
+    send_reqs.clear()
+
+
+# Sync P2P-recv OP: we distinguish forward recv reqs from backward recv reqs
+# to enable 1F1B P2P communication overlap
+def drain_recv_reqs(drain_type="all"):
+    global recv_fwd_reqs, recv_bwd_reqs
+    if drain_type == "all" or drain_type == "forward":
+        if len(recv_fwd_reqs) > 0:
+            for req in recv_fwd_reqs:
+                req.wait()
+            recv_fwd_reqs.clear()
+    if drain_type == "all" or drain_type == "backward":
+        if len(recv_bwd_reqs) > 0:
+            for req in recv_bwd_reqs:
+                req.wait()
+            recv_bwd_reqs.clear()
+
+
+def _mapping_local_rank_to_target_rank_by_device_mesh(
+    *, current_device_mesh: DeviceMesh, target_device_mesh: DeviceMesh, local_rank: int
+):
+    """Map a local rank in the current device mesh to the corresponding rank in the target device mesh.
+
+    Takes the following arguments:
+        current_device_mesh: current device mesh for locating the rank position
+        target_device_mesh: target device mesh for mapping to the target rank
+        local_rank: the rank to be mapped
+    Returns:
+        target_rank
+    """
+    if target_device_mesh is None:
+        return None
+    current_device_mesh_list = current_device_mesh.mesh.view(-1).tolist()
+    assert local_rank in current_device_mesh_list
+    current_rank_pos = current_device_mesh_list.index(local_rank)
+    target_rank = target_device_mesh.mesh.view(-1).tolist()[current_rank_pos]
+    return target_rank
+
+
+def _get_p2p_send_recv_process_group(
+    *, current_device_mesh: DeviceMesh, target_device_mesh: DeviceMesh, local_rank: int
+):
+    # NOTE: local_rank must be forwarded to the mapping helper, and the pair is
+    # returned as a list literal (list(a, b) is not valid Python).
+    target_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        current_device_mesh=current_device_mesh, target_device_mesh=target_device_mesh, local_rank=local_rank
+    )
+    return [local_rank, target_rank]
+
+
+def _communicate_shapes(
+    *,
+    tensor_send_next: torch.tensor,
+    tensor_send_prev: torch.tensor,
+    prev_rank: int,
+    next_rank: int,
+    recv_prev: bool,
+    recv_next: bool,
+    local_rank: int,
+    shape_dim: int = 3,
+):
+    """Communicate tensor shapes between stages. Used to communicate
+    tensor shapes before the actual tensor communication happens.
+    This is required when the sequence lengths across micro batches
+    are not uniform.
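+
+    A minimal sketch of a typical call (names here are illustrative and ranks
+    are assumed to be resolved already), sending a forward activation's shape
+    downstream while expecting one from upstream:
+
+        recv_prev_shape, _ = _communicate_shapes(
+            tensor_send_next=act,
+            tensor_send_prev=None,
+            prev_rank=prev_rank,
+            next_rank=next_rank,
+            recv_prev=True,
+            recv_next=False,
+            local_rank=local_rank,
+        )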
+ + Takes the following arguments: + tensor_send_next: DTensor or torch.tensor to send to next rank (no tensor sent if + set to None). + tensor_send_prev: DTensor or torch.tensor to send to prev rank (no tensor sent if + set to None). + prev_rank: prev rank for send/recv rank + next_rank: next rank for send/recv rank + recv_prev: boolean for whether tensor should be received from + previous rank. + recv_next: boolean for whether tensor should be received from + next rank. + shape_dim: default to 3, which is set in megatron, in this refactor func, you can set shape dim + Returns: + (recv_prev_shape, recv_next_shape) + """ + + recv_prev_shape_tensor = None + recv_next_shape_tensor = None + send_prev_shape_tensor = None + send_next_shape_tensor = None + + if recv_prev: + recv_prev_shape_tensor = torch.empty((shape_dim), device=torch.cuda.current_device(), dtype=torch.int64) + if recv_next: + recv_next_shape_tensor = torch.empty((shape_dim), device=torch.cuda.current_device(), dtype=torch.int64) + if tensor_send_prev is not None: + if isinstance(tensor_send_prev, DTensor): + send_prev_shape_tensor = torch.tensor( + tensor_send_prev._local_tensor.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + else: + send_prev_shape_tensor = torch.tensor( + tensor_send_prev.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + if tensor_send_next is not None: + if isinstance(tensor_send_next, DTensor): + send_next_shape_tensor = torch.tensor( + tensor_send_next._local_tensor.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + else: + send_next_shape_tensor = torch.tensor( + tensor_send_next.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + ops = [] + if send_prev_shape_tensor is not None: + send_prev_op = torch.distributed.P2POp(torch.distributed.isend, send_prev_shape_tensor, prev_rank) + ops.append(send_prev_op) + if recv_next_shape_tensor is not None: + recv_next_op = torch.distributed.P2POp(torch.distributed.irecv, recv_next_shape_tensor, next_rank) + ops.append(recv_next_op) + if send_next_shape_tensor is not None: + send_next_op = torch.distributed.P2POp(torch.distributed.isend, send_next_shape_tensor, next_rank) + ops.append(send_next_op) + if recv_prev_shape_tensor is not None: + recv_prev_op = torch.distributed.P2POp(torch.distributed.irecv, recv_prev_shape_tensor, prev_rank) + ops.append(recv_prev_op) + + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + for req in reqs: + req.wait() + + # To protect against race condition when using batch_isend_irecv(). + # should take this out once the bug with batch_isend_irecv is resolved. 
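+    # When _coalescing_manager is importable, batched p2p ops are coalesced and
+    # the explicit device-wide synchronize below is skipped.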
+ if not _coalescing_manager: + torch.cuda.synchronize() + + recv_prev_shape = [0, 0, 0] + if recv_prev_shape_tensor is not None: + recv_prev_shape = recv_prev_shape_tensor.tolist() + + recv_next_shape = [0, 0, 0] + if recv_next_shape_tensor is not None: + recv_next_shape = recv_next_shape_tensor.tolist() + + return recv_prev_shape, recv_next_shape + + +def _batched_p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + prev_rank: int, + next_rank: int, + group: torch.distributed.ProcessGroup, + local_rank: int, + send_tensor_shape_unpad: Shape = None, + p2p_overlap=False, +): + ops = [] + if tensor_send_prev is not None: + send_prev_op = torch.distributed.P2POp(torch.distributed.isend, tensor_send_prev, prev_rank) + ops.append(send_prev_op) + if tensor_recv_prev is not None: + recv_prev_op = torch.distributed.P2POp(torch.distributed.irecv, tensor_recv_prev, prev_rank) + ops.append(recv_prev_op) + if tensor_send_next is not None: + send_next_op = torch.distributed.P2POp(torch.distributed.isend, tensor_send_next, next_rank) + ops.append(send_next_op) + if tensor_recv_next is not None: + recv_next_op = torch.distributed.P2POp(torch.distributed.irecv, tensor_recv_next, next_rank) + ops.append(recv_next_op) + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + else: + reqs = [] + return reqs + + +def check_nan(tensor_list, check=False): + if check: + for t in tensor_list: + assert not torch.isnan(t).any(), ( + "tensor shape: " + + str(t.shape) + + ", dtype: " + + str(t.dtype) + + ", device: " + + str(t.device) + + ", # of NaN elements: " + + str(torch.sum(torch.isnan(t)).item()) + + ", NaN element indexes: " + + str(torch.isnan(t).nonzero()) + ) + + +def _p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + prev_rank: int, + next_rank: int, + group: torch.distributed.ProcessGroup, + local_rank: int, + p2p_overlap=False, + send_tensor_shape_unpad: Shape = None, + # file=None, +): + reqs = [] + + """ + by now the megatron pingpong + send recv is not supported because the global + devicemeshmanager is not impled. 
we will use + the ucx and mpi two-end no-blocking api to do + the send recv + """ + stage_id = int(os.environ.get("STAGE_ID", "0")) + op_type = [] + if stage_id % 2: + if tensor_send_next is not None: + if send_tensor_shape_unpad is not None: + assert ( + send_tensor_shape_unpad[0] <= tensor_send_next.shape[0] + ), f"{send_tensor_shape_unpad} vs {tensor_send_next.shape}" + check_nan([tensor_send_next[: send_tensor_shape_unpad[0]]]) + else: + check_nan([tensor_send_next]) + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, + dst=next_rank, + group=group, + ) + reqs.append(send_next_req) + op_type.append(OpType.SEND) + + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, + src=prev_rank, + group=group, + ) + reqs.append(recv_prev_req) + op_type.append(OpType.RECV_FWD) + + if tensor_send_prev is not None: + if send_tensor_shape_unpad is not None: + assert ( + send_tensor_shape_unpad[0] <= tensor_send_prev.shape[0] + ), f"{send_tensor_shape_unpad} vs {tensor_send_prev.shape}" + check_nan([tensor_send_prev[: send_tensor_shape_unpad[0]]]) + else: + check_nan([tensor_send_prev]) + + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, + dst=prev_rank, + group=group, + ) + reqs.append(send_prev_req) + op_type.append(OpType.SEND) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, + src=next_rank, + group=group, + ) + reqs.append(recv_next_req) + op_type.append(OpType.RECV_BWD) + + else: + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, + src=prev_rank, + group=group, + ) + reqs.append(recv_prev_req) + op_type.append(OpType.RECV_FWD) + if tensor_send_next is not None: + if send_tensor_shape_unpad is not None: + assert ( + send_tensor_shape_unpad[0] <= tensor_send_next.shape[0] + ), f"{send_tensor_shape_unpad} vs {tensor_send_next.shape}" + check_nan([tensor_send_next[: send_tensor_shape_unpad[0]]]) + else: + check_nan([tensor_send_next]) + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, + dst=next_rank, + group=group, + ) + reqs.append(send_next_req) + op_type.append(OpType.SEND) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, + src=next_rank, + group=group, + ) + reqs.append(recv_next_req) + op_type.append(OpType.RECV_BWD) + + if tensor_send_prev is not None: + if send_tensor_shape_unpad is not None: + assert ( + send_tensor_shape_unpad[0] <= tensor_send_prev.shape[0] + ), f"{send_tensor_shape_unpad} vs {tensor_send_prev.shape}" + check_nan([tensor_send_prev[: send_tensor_shape_unpad[0]]]) + else: + check_nan([tensor_send_prev]) + + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, + dst=prev_rank, + group=group, + ) + reqs.append(send_prev_req) + op_type.append(OpType.SEND) + + if p2p_overlap: + # For P2P-comm overlap + global send_reqs, recv_fwd_reqs, recv_bwd_reqs + for i in range(len(op_type)): + if op_type[i] == OpType.SEND: + send_reqs.append(reqs[i]) + elif op_type[i] == OpType.RECV_FWD: + recv_fwd_reqs.append(reqs[i]) + elif op_type[i] == OpType.RECV_BWD: + recv_bwd_reqs.append(reqs[i]) + + return reqs + + +def _communicate( + *, + tensor_send_next: Optional[torch.Tensor], + tensor_send_prev: Optional[torch.Tensor], + recv_prev: bool, + recv_next: bool, + current_device_mesh: DeviceMesh, + prev_device_mesh: DeviceMesh = None, + next_device_mesh: DeviceMesh = None, + tensor_shape: Shape = None, + 
send_tensor_shape_unpad: Shape = None,
+    batch_p2p_comm: bool = True,
+    wait_on_reqs: bool = True,
+    dtype: Optional[torch.dtype],
+    group: ProcessGroup = None,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[List]]:
+    """Communicate tensors between stages. Used as a helper method in other
+    communication methods that are used in vescale/schedules.py.
+
+    Arguments:
+        tensor_send_next (torch.Tensor, optional):
+            Tensor to send to next rank (no tensor sent if None)
+
+        tensor_send_prev (torch.Tensor, optional):
+            Tensor to send to prev rank (no tensor sent if None)
+
+        current_device_mesh (DeviceMesh, required):
+            Current device mesh for locating the rank position
+
+        prev_device_mesh (DeviceMesh, optional):
+            Target device mesh for mapping to the prev rank
+
+        next_device_mesh (DeviceMesh, optional):
+            Target device mesh for mapping to the next rank
+
+        recv_prev (boolean, required):
+            whether tensor should be received from previous rank.
+
+        recv_next (boolean, required):
+            whether tensor should be received from next rank.
+
+        tensor_shape (List[int] or torch.Size, required):
+            shape of tensor to receive (this method assumes that all
+            tensors sent and received in a single function call are
+            the same shape). If None, shapes are exchanged dynamically
+            via _communicate_shapes, which supports variable sequence
+            lengths across microbatches at some extra overhead; prefer
+            a fixed tensor_shape when the sequence length is constant
+            during training.
+
+        batch_p2p_comm (boolean, required):
+            If true use batch_isend_irecv, otherwise use individual
+            isend and irecv calls.
+
+        wait_on_reqs (boolean, optional, default=True):
+            For non-batched p2p communication, wait on each request
+            before returning.
+
+        dtype (torch.dtype, required if either recv_{prev,next} is True):
+            this must be the type of the tensors that will be
+            received, will typically be params_dtype, but in the case
+            of fp32 residual connections might be torch.float.
+
+    Returns:
+        tuple containing
+
+        - tensor_recv_prev: torch.Tensor if recv_prev is True, None otherwise.
+        - tensor_recv_next: torch.Tensor if recv_next is True, None otherwise.
+        - reqs: outstanding p2p requests when wait_on_reqs is False; None after
+          the requests have been waited on.
+
+    """
+    # Init p2p_overlap: Use a global var to enable p2p comm overlap,
+    # so as not to change the original APIs
+
+    global p2p_overlap
+    if not wait_on_reqs and not p2p_overlap:
+        p2p_overlap = True
+
+    # Create placeholder tensors for receive in forward and backward directions
+    # if needed.
+    tensor_recv_prev = None
+    tensor_recv_next = None
+
+    # This will come from config in the next version, for now hard
+    # code it here to match existing functionality.
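+    # batch_p2p_sync gates the torch.cuda.synchronize() fallback performed after
+    # batched p2p ops at the end of this function.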
+ batch_p2p_sync = True + local_rank = current_device_mesh.get_rank() + # parse current device mesh and target device mesh + prev_rank = _mapping_local_rank_to_target_rank_by_device_mesh( + local_rank=local_rank, current_device_mesh=current_device_mesh, target_device_mesh=prev_device_mesh + ) + next_rank = _mapping_local_rank_to_target_rank_by_device_mesh( + local_rank=local_rank, current_device_mesh=current_device_mesh, target_device_mesh=next_device_mesh + ) + # flag to reuse intermediate tensor shapes of recorded tensors in first minibatch + reuse_intermediate_shapes = os.environ.get("REUSE_COMM_SHAPE", "0") == "1" + + if tensor_shape is not None: + recv_prev_shape = tensor_shape + recv_next_shape = tensor_shape + else: + global GLOBAL_COUNTER + global INTERMEDIATE_SHAPES + global MINIBATCH_STEPS + if reuse_intermediate_shapes and MINIBATCH_STEPS > 1: + recv_prev_shape, recv_next_shape = INTERMEDIATE_SHAPES[GLOBAL_COUNTER] + else: + recv_prev_shape, recv_next_shape = _communicate_shapes( + tensor_send_next=tensor_send_next, + tensor_send_prev=tensor_send_prev, + recv_prev=recv_prev, + recv_next=recv_next, + prev_rank=prev_rank, + next_rank=next_rank, + local_rank=local_rank, + ) + if reuse_intermediate_shapes: + INTERMEDIATE_SHAPES.append((recv_prev_shape, recv_next_shape)) + GLOBAL_COUNTER += 1 + + if recv_prev: + if dtype is None: + raise RuntimeError("dtype must be provided if recv_prev is True") + if recv_prev_shape is None: + raise RuntimeError( + "tensor_shape must be specified if recv_prev is True. " + "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" + ) + tensor_recv_prev = torch.empty( + recv_prev_shape, requires_grad=True, device=torch.cuda.current_device(), dtype=dtype + ) + if recv_next: + if dtype is None: + raise RuntimeError("dtype must be provided if recv_next is True") + if recv_next_shape is None: + raise RuntimeError( + "tensor_shape must be specified if recv_next is True. " + "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" + ) + tensor_recv_next = torch.empty( + recv_next_shape, requires_grad=True, device=torch.cuda.current_device(), dtype=dtype + ) + + # Send tensors in both the forward and backward directions as appropriate. + if batch_p2p_comm: + assert wait_on_reqs + p2p_func = _batched_p2p_ops + else: + p2p_func = _p2p_ops + + # if file: + # file.write( + # f"\np2p tensor_send_prev:{tensor_send_prev}, tensor_recv_prev:{tensor_recv_prev} {id(tensor_recv_prev)}, tensor_send_next:{tensor_send_next} {id(tensor_send_next)}, tensor_recv_next:{tensor_recv_next}, prev_rank: {prev_rank}, next_rank: {next_rank}, local_rank: {local_rank}\n" + # ) + # file.flush() + reqs = p2p_func( + tensor_send_prev=tensor_send_prev, + tensor_recv_prev=tensor_recv_prev, + tensor_send_next=tensor_send_next, + tensor_recv_next=tensor_recv_next, + prev_rank=prev_rank, + next_rank=next_rank, + group=group, + local_rank=local_rank, + send_tensor_shape_unpad=send_tensor_shape_unpad, + p2p_overlap=p2p_overlap, + ) + + if wait_on_reqs and len(reqs) > 0: + for req in reqs: + req.wait() + reqs = None + + if batch_p2p_comm and batch_p2p_sync: + # To protect against race condition when using batch_isend_irecv(). 
+
+
+def recv_forward(
+    tensor_shape: Shape,
+    recv_dtype: torch.dtype,
+    current_device_mesh: DeviceMesh,
+    peer_device_mesh: Optional[DeviceMesh] = None,
+    batch_p2p_comm: bool = True,
+) -> torch.Tensor:
+    """Receive tensor from the previous rank in the pipeline (forward receive).
+
+    See _communicate for argument details.
+
+    Args:
+        tensor_shape (Shape): shape of the tensor to be received
+        recv_dtype (torch.dtype): data type of the received tensor
+        current_device_mesh (DeviceMesh): sub-DeviceMesh of the current stage
+        peer_device_mesh (Optional[DeviceMesh]): sub-DeviceMesh of the sending stage
+        batch_p2p_comm (bool): if True, use batched p2p transfers
+
+    Returns:
+        Received forward tensor.
+
+    """
+    if peer_device_mesh is None:
+        input_tensor = None
+        return input_tensor
+    prev_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        local_rank=current_device_mesh.get_rank(),
+        current_device_mesh=current_device_mesh,
+        target_device_mesh=peer_device_mesh,
+    )
+    with ndtimeit_p2p(RECV_FORWARD, dist.group.WORLD, prev_rank, batch_p2p_comm):
+        input_tensor, _, _ = _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=peer_device_mesh,
+            recv_prev=True,
+            recv_next=False,
+            tensor_shape=tensor_shape,
+            batch_p2p_comm=batch_p2p_comm,
+            dtype=recv_dtype,
+        )
+    return input_tensor
+
+
+def recv_backward(
+    tensor_shape: Shape,
+    recv_dtype: torch.dtype,
+    current_device_mesh: DeviceMesh,
+    peer_device_mesh: Optional[DeviceMesh] = None,
+    batch_p2p_comm: bool = True,
+) -> torch.Tensor:
+    """Receive tensor from the next rank in the pipeline (backward receive).
+
+    See _communicate for argument details.
+
+    Args:
+        tensor_shape (Shape): shape of the tensor to be received
+        recv_dtype (torch.dtype): data type of the received tensor
+        current_device_mesh (DeviceMesh): sub-DeviceMesh of the current stage
+        peer_device_mesh (Optional[DeviceMesh]): sub-DeviceMesh of the sending stage
+        batch_p2p_comm (bool): if True, use batched p2p transfers
+
+    Returns:
+        Received output tensor gradient.
+
+    """
+    if peer_device_mesh is None:
+        output_tensor_grad = None
+        return output_tensor_grad
+    next_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        local_rank=current_device_mesh.get_rank(),
+        current_device_mesh=current_device_mesh,
+        target_device_mesh=peer_device_mesh,
+    )
+    with ndtimeit_p2p(RECV_BACKWARD, dist.group.WORLD, next_rank, batch_p2p_comm):
+        _, output_tensor_grad, _ = _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            next_device_mesh=peer_device_mesh,
+            recv_prev=False,
+            recv_next=True,
+            tensor_shape=tensor_shape,
+            dtype=recv_dtype,
+            batch_p2p_comm=batch_p2p_comm,
+        )
+    return output_tensor_grad
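A small usage sketch of the receive helpers follows; on the first stage `peer_device_mesh` is None, so the call degenerates to returning None. The mesh handles are the same hypothetical ones assumed earlier:

```python
# Sketch with hypothetical meshes; shape/dtype must match what the peer sends.
x = recv_forward(
    tensor_shape=(128, 2, 1024),
    recv_dtype=torch.bfloat16,
    current_device_mesh=cur_mesh,
    peer_device_mesh=prev_mesh,  # None on the first stage -> returns None
)
```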
+
+
+def send_forward(
+    output_tensor: torch.Tensor,
+    current_device_mesh: DeviceMesh,
+    peer_device_mesh: Optional[DeviceMesh] = None,
+    tensor_shape: Optional[Shape] = None,
+    batch_p2p_comm: bool = True,
+) -> None:
+    """Send tensor to the next rank in the pipeline (forward send).
+
+    See _communicate for argument details.
+
+    Args:
+        output_tensor (torch.Tensor): forward output of the current stage, sent to the next stage
+        current_device_mesh (DeviceMesh): sub-DeviceMesh of the current stage
+        peer_device_mesh (Optional[DeviceMesh]): sub-DeviceMesh of the receiving stage
+        tensor_shape (Shape): shape of the tensor to be sent
+        batch_p2p_comm (bool): if True, use batched p2p transfers
+
+    """
+
+    if peer_device_mesh is None:
+        return
+    next_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        local_rank=current_device_mesh.get_rank(),
+        current_device_mesh=current_device_mesh,
+        target_device_mesh=peer_device_mesh,
+    )
+    with ndtimeit_p2p(SEND_FORWARD, dist.group.WORLD, next_rank, batch_p2p_comm):
+        _communicate(
+            tensor_send_next=output_tensor,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            next_device_mesh=peer_device_mesh,
+            recv_prev=False,
+            recv_next=False,
+            tensor_shape=tensor_shape,
+            batch_p2p_comm=batch_p2p_comm,
+            dtype=None,
+        )
+
+
+def send_backward(
+    input_tensor_grad: torch.Tensor,
+    current_device_mesh: DeviceMesh,
+    peer_device_mesh: Optional[DeviceMesh] = None,
+    tensor_shape: Optional[Shape] = None,
+    batch_p2p_comm: bool = True,
+) -> None:
+    """Send tensor to the previous rank in the pipeline (backward send).
+
+    See _communicate for argument details.
+
+    Args:
+        input_tensor_grad (torch.Tensor): input tensor gradients, sent to the previous stage
+        current_device_mesh (DeviceMesh): sub-DeviceMesh of the current stage
+        peer_device_mesh (Optional[DeviceMesh]): sub-DeviceMesh of the receiving stage
+        tensor_shape (Shape): shape of the tensor to be sent
+        batch_p2p_comm (bool): if True, use batched p2p transfers
+
+    """
+
+    if peer_device_mesh is None:
+        return
+    prev_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        local_rank=current_device_mesh.get_rank(),
+        current_device_mesh=current_device_mesh,
+        target_device_mesh=peer_device_mesh,
+    )
+    with ndtimeit_p2p(SEND_BACKWARD, dist.group.WORLD, prev_rank, batch_p2p_comm):
+        _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=input_tensor_grad,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=peer_device_mesh,
+            recv_prev=False,
+            recv_next=False,
+            tensor_shape=tensor_shape,
+            batch_p2p_comm=batch_p2p_comm,
+            dtype=None,
+        )
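Combined, the four helpers above are enough to express a forward-only pipeline step. A sketch under the same hypothetical names used earlier (`stage_module`, `cur_mesh`, `prev_mesh`, `next_mesh`):

```python
def forward_only_step(num_microbatches: int):
    # Sketch: each stage receives, computes, and forwards one microbatch at a time.
    for _ in range(num_microbatches):
        x = recv_forward((128, 2, 1024), torch.bfloat16, cur_mesh, prev_mesh)
        y = stage_module(x)  # local compute for this pipeline stage
        send_forward(y, cur_mesh, next_mesh, tensor_shape=tuple(y.shape))
```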
+
+
+def send_forward_recv_backward(
+    output_tensor: torch.Tensor,
+    tensor_shape: Shape,
+    recv_dtype: torch.dtype,
+    current_device_mesh: DeviceMesh,
+    peer_device_mesh: Optional[DeviceMesh] = None,
+    batch_p2p_comm: bool = True,
+) -> torch.Tensor:
+    """Batched send and recv with the next rank in the pipeline.
+
+    See _communicate for argument details.
+
+    Args:
+        output_tensor (torch.Tensor): forward output of the current stage, sent to the next stage
+        tensor_shape (Shape): shape of the tensor to be received
+        recv_dtype (torch.dtype): data type of the received tensor
+        current_device_mesh (DeviceMesh): sub-DeviceMesh of the current stage
+        peer_device_mesh (Optional[DeviceMesh]): sub-DeviceMesh of the peer stage
+        batch_p2p_comm (bool): if True, use batched p2p transfers
+
+    Returns:
+        Received output tensor gradient.
+
+    """
+
+    if peer_device_mesh is None:
+        output_tensor_grad = None
+        return output_tensor_grad
+    next_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        local_rank=current_device_mesh.get_rank(),
+        current_device_mesh=current_device_mesh,
+        target_device_mesh=peer_device_mesh,
+    )
+    with ndtimeit_p2p(SEND_FORWARD_RECV_BACKWARD, dist.group.WORLD, next_rank, batch_p2p_comm):
+        _, output_tensor_grad, _ = _communicate(
+            tensor_send_next=output_tensor,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            next_device_mesh=peer_device_mesh,
+            recv_prev=False,
+            recv_next=True,
+            tensor_shape=tensor_shape,
+            dtype=recv_dtype,
+            batch_p2p_comm=batch_p2p_comm,
+        )
+    return output_tensor_grad
+
+
+def send_backward_recv_forward(
+    input_tensor_grad: torch.Tensor,
+    tensor_shape: Shape,
+    recv_dtype: torch.dtype,
+    current_device_mesh: DeviceMesh,
+    peer_device_mesh: Optional[DeviceMesh] = None,
+    batch_p2p_comm: bool = True,
+) -> torch.Tensor:
+    """
+    Batched send and recv with the previous rank in the pipeline.
+
+    See _communicate for argument details.
+
+    Args:
+        input_tensor_grad (torch.Tensor): input tensor gradients, sent to the previous stage
+        tensor_shape (Shape): shape of the tensor to be received
+        recv_dtype (torch.dtype): data type of the received tensor
+        current_device_mesh (DeviceMesh): sub-DeviceMesh of the current stage
+        peer_device_mesh (Optional[DeviceMesh]): sub-DeviceMesh of the peer stage
+        batch_p2p_comm (bool): if True, use batched p2p transfers
+
+    Returns:
+        Received forward input tensor.
+
+    """
+    if peer_device_mesh is None:
+        input_tensor = None
+        return input_tensor
+    prev_rank = _mapping_local_rank_to_target_rank_by_device_mesh(
+        local_rank=current_device_mesh.get_rank(),
+        current_device_mesh=current_device_mesh,
+        target_device_mesh=peer_device_mesh,
+    )
+    with ndtimeit_p2p(SEND_BACKWARD_RECV_FORWARD, dist.group.WORLD, prev_rank, batch_p2p_comm):
+        input_tensor, _, _ = _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=input_tensor_grad,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=peer_device_mesh,
+            recv_prev=True,
+            recv_next=False,
+            tensor_shape=tensor_shape,
+            dtype=recv_dtype,
+            batch_p2p_comm=batch_p2p_comm,
+        )
+    return input_tensor
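The fused variants shave one round trip off the steady-state 1F1B loop. A sketch of the steady state on an interior stage, with `x` and `y` assumed to be the input and output of an earlier warmup microbatch:

```python
# Sketch of the 1F1B steady state (hypothetical tensors and meshes).
grad_of_y = send_forward_recv_backward(y, tuple(y.shape), torch.bfloat16, cur_mesh, next_mesh)
y.backward(grad_of_y)  # backward for the oldest in-flight microbatch
x_next = send_backward_recv_forward(x.grad, tuple(x.shape), torch.bfloat16, cur_mesh, prev_mesh)
```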
+
+
+def send_forward_recv_forward(
+    output_tensor: torch.Tensor,
+    recv_prev: bool,
+    tensor_shape: Shape,
+    current_device_mesh: DeviceMesh,
+    prev_device_mesh: DeviceMesh,
+    next_device_mesh: DeviceMesh,
+    send_tensor_shape_unpad: Shape = None,
+    overlap_p2p_comm: bool = False,
+    recv_dtype: Optional[torch.dtype] = None,
+    batch_p2p_comm: bool = True,
+    group: ProcessGroup = None,
+) -> torch.Tensor:
+    """Batched recv from the previous rank and send to the next rank in the pipeline.
+
+    See _communicate for argument details.
+
+    Returns the received input tensor, plus the p2p wait handles when
+    overlap_p2p_comm is True.
+    """
+    # automatically disable send/recv when there is no peer stage
+    if prev_device_mesh is None:
+        recv_prev = False
+    if next_device_mesh is None:
+        input_tensor, _, wait_handles = _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=prev_device_mesh,
+            next_device_mesh=next_device_mesh,
+            recv_prev=recv_prev,
+            recv_next=False,
+            tensor_shape=tensor_shape,
+            send_tensor_shape_unpad=send_tensor_shape_unpad,
+            batch_p2p_comm=batch_p2p_comm,
+            wait_on_reqs=(not overlap_p2p_comm),
+            dtype=recv_dtype,
+            group=group,
+        )
+    else:
+        input_tensor, _, wait_handles = _communicate(
+            tensor_send_next=output_tensor,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=prev_device_mesh,
+            next_device_mesh=next_device_mesh,
+            recv_prev=recv_prev,
+            recv_next=False,
+            tensor_shape=tensor_shape,
+            send_tensor_shape_unpad=send_tensor_shape_unpad,
+            batch_p2p_comm=batch_p2p_comm,
+            wait_on_reqs=(not overlap_p2p_comm),
+            dtype=recv_dtype,
+            group=group,
+        )
+    if overlap_p2p_comm:
+        return input_tensor, wait_handles
+    return input_tensor
+
+
+def send_backward_recv_backward(
+    input_tensor_grad: torch.Tensor,
+    recv_next: bool,
+    tensor_shape: Shape,
+    current_device_mesh: DeviceMesh,
+    prev_device_mesh: DeviceMesh,
+    next_device_mesh: DeviceMesh,
+    send_tensor_shape_unpad: Shape = None,
+    overlap_p2p_comm: bool = False,
+    recv_dtype: Optional[torch.dtype] = None,
+    batch_p2p_comm: bool = True,
+    group: ProcessGroup = None,
+) -> torch.Tensor:
+    """Batched recv from the next rank and send to the previous rank in the pipeline.
+
+    See _communicate for argument details.
+
+    Returns the received output tensor gradient, plus the p2p wait handles when
+    overlap_p2p_comm is True.
+    """
+    # automatically disable send/recv when there is no peer stage
+    if next_device_mesh is None:
+        recv_next = False
+    if prev_device_mesh is None:
+        _, output_tensor_grad, wait_handles = _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=None,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=prev_device_mesh,
+            next_device_mesh=next_device_mesh,
+            recv_prev=False,
+            recv_next=recv_next,
+            tensor_shape=tensor_shape,
+            send_tensor_shape_unpad=send_tensor_shape_unpad,
+            batch_p2p_comm=batch_p2p_comm,
+            wait_on_reqs=(not overlap_p2p_comm),
+            dtype=recv_dtype,
+            group=group,
+        )
+    else:
+        _, output_tensor_grad, wait_handles = _communicate(
+            tensor_send_next=None,
+            tensor_send_prev=input_tensor_grad,
+            current_device_mesh=current_device_mesh,
+            prev_device_mesh=prev_device_mesh,
+            next_device_mesh=next_device_mesh,
+            recv_prev=False,
+            recv_next=recv_next,
+            tensor_shape=tensor_shape,
+            send_tensor_shape_unpad=send_tensor_shape_unpad,
+            batch_p2p_comm=batch_p2p_comm,
+            wait_on_reqs=(not overlap_p2p_comm),
+            dtype=recv_dtype,
+            group=group,
+        )
+    if overlap_p2p_comm:
+        return output_tensor_grad, wait_handles
+    return output_tensor_grad
+
+
+def send_forward_backward_recv_forward_backward(
+    output_tensor: torch.Tensor,
+    input_tensor_grad: torch.Tensor,
+    recv_prev: bool,
+    recv_next: bool,
+    tensor_shape: Shape,
+    current_device_mesh: DeviceMesh,
+    prev_device_mesh: DeviceMesh,
+    next_device_mesh: DeviceMesh,
+    recv_dtype: Optional[torch.dtype] = None,
+    batch_p2p_comm: bool = True,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Batched send and recv with the previous and next ranks in the pipeline.
+
+    See _communicate for argument details.
+    """
+    input_tensor, output_tensor_grad, _ = _communicate(
+        tensor_send_next=output_tensor,
+        tensor_send_prev=input_tensor_grad,
+        current_device_mesh=current_device_mesh,
+        prev_device_mesh=prev_device_mesh,
+        next_device_mesh=next_device_mesh,
+        recv_prev=recv_prev,
+        recv_next=recv_next,
+        tensor_shape=tensor_shape,
+        dtype=recv_dtype,
+        batch_p2p_comm=batch_p2p_comm,
+    )
+    return input_tensor, output_tensor_grad
diff --git a/vescale/pipe/pipe_emmiter.py b/vescale/pipe/pipe_emmiter.py
new file mode 100644
index 0000000..d60fc3d
--- /dev/null
+++ b/vescale/pipe/pipe_emmiter.py
@@ -0,0 +1,356 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+# mypy: ignore-errors
+from vescale.dtensor.device_mesh import DeviceMesh
+from vescale.plan.pipeline_parallel import PipelineParallelPlan
+from vescale.plan.spec import PipelineScheduleType
+from vescale.pipe._schedules import (
+    OneFOneBInstrcutionGenerator,
+    InterleavedOneFOneBInstructionGenerator,
+    ZeroBubbleVInstrcutionGenerator,
+    StageDeps,
+    Shape,
+)
+from vescale.pipe._schedules.instruction_base import VESCALE_INTRUCTION_BUILDER as builder
+from vescale.pipe.p2p_communication import reset_global_counter
+from vescale.devicemesh_api.api import VeDeviceMesh
+from collections import OrderedDict
+from typing import Callable, Iterator, List, Sequence, Union
+import torch
+import torch.distributed as dist
+import logging
+import os
+
+
+logger = logging.getLogger(__file__)
+
+
+class PipelineEmitter:
+    """Pipeline Emitter."""
+
+    def __init__(
+        self,
+        deps: StageDeps,
+        meshes: Sequence[DeviceMesh],
+        schedule: PipelineScheduleType,
+        batches: int,
+        tensor_shape: Shape,
+        dtype: torch.dtype,
+        num_chunks: int = 1,
+        input_shapes: List[Shape] = None,
+        input_shapes_unpad: List[Shape] = None,
+        forward_only=False,
+        overlap_p2p_comm=False,
+        batch_p2p_comm: bool = True,
+        param_sync_overlap=False,
+        grad_sync_overlap=False,
+        **kwargs,
+    ):
+        self.deps = deps
+        self.num_stage = deps.num_stage
+        self.meshes = meshes
+        self.batches = batches
+        self.num_chunks = num_chunks
+        self.overlap_p2p_comm = overlap_p2p_comm
+        self.batch_p2p_comm = batch_p2p_comm
+        self.param_sync_overlap = param_sync_overlap
+        self.forward_only = forward_only
+        self.grad_sync_overlap = grad_sync_overlap
+        if schedule == PipelineScheduleType.SIMPLE_1F1B:
+            self.num_meshes = meshes
+            self.instruction_generator = OneFOneBInstrcutionGenerator(
+                deps=deps,
+                meshes=self.meshes,
+                batches=batches,
+                default_shape=tensor_shape,
+                default_dtype=dtype,
+                forward_only=self.forward_only,
+            )
+
+        elif schedule == PipelineScheduleType.INTERLEAVED_1F1B:
+            self.instruction_generator = InterleavedOneFOneBInstructionGenerator(
+                deps=deps,
+                meshes=self.meshes,
+                batches=batches,
+                default_shape=tensor_shape,
+                default_dtype=dtype,
+                input_shapes=input_shapes,
+                input_shapes_unpad=input_shapes_unpad,
+                num_chunks=self.num_chunks,
+                batch_p2p_comm=batch_p2p_comm,
+                overlap_p2p_comm=overlap_p2p_comm,
+                param_sync_overlap=param_sync_overlap,
+                grad_sync_overlap=grad_sync_overlap,
+                forward_only=forward_only,
+            )
+
+        elif schedule == PipelineScheduleType.ZERO_BUBBLE:
+            self.instruction_generator = ZeroBubbleVInstrcutionGenerator(
+                deps=deps,
+                meshes=self.meshes,
+                batches=batches,
+                default_shape=tensor_shape,
+                default_dtype=dtype,
+                **kwargs,
+            )
+        else:
+            raise NotImplementedError("unsupported schedule type")
+        self.instruction_list: List[List] = self.gen_instruction()
+
+    def gen_instruction(self):
+        """
+        Generates the instruction steps of a pipeline schedule.
+        """
+        return self.instruction_generator.gen_instruction()
+
+    def get_instruction_list(self, stage: int):
+        """
+        Returns the instruction steps of a pipeline schedule for a particular pipeline stage.
+
+        Args:
+            stage (int): pipeline stage id
+
+        """
+        return self.instruction_generator.get_instruction_list(stage)
+
+
+class ScheduleEngine:
+    def __init__(
+        self,
+        deps: StageDeps,
+        meshes: Sequence[DeviceMesh],
+        schedule: PipelineScheduleType,
+        batches: int,
+        data_iterator: Union[Iterator, List[Iterator]],
+        stage_id: int,
+        shape: Union[Shape, Sequence[Shape]],
+        dtype: Union[torch.dtype, Sequence[torch.dtype]] = torch.float32,
+        num_chunks=1,
+        input_shapes: List[Shape] = None,
+        input_shapes_unpad: List[Shape] = None,
+        forward_only=False,
+        overlap_p2p_comm=False,
+        batch_p2p_comm: bool = True,
+        param_sync_overlap=False,
+        grad_sync_overlap=False,
+        send_dtypes_map: OrderedDict = None,
+        loss_fn: Callable = lambda x: torch.sum(x),
+        global_mesh: VeDeviceMesh = None,
+        **kwargs,
+    ):
+        os.environ["STAGE_ID"] = str(stage_id)
+        self.p_emmiter = PipelineEmitter(
+            deps,
+            meshes,
+            schedule,
+            batches,
+            shape,
+            dtype,
+            num_chunks=num_chunks,
+            input_shapes=input_shapes,
+            input_shapes_unpad=input_shapes_unpad,
+            forward_only=forward_only,
+            overlap_p2p_comm=overlap_p2p_comm,
+            batch_p2p_comm=batch_p2p_comm,
+            param_sync_overlap=param_sync_overlap,
+            grad_sync_overlap=grad_sync_overlap,
+            **kwargs,
+        )
+        self.schedule = schedule
+        self.deps = deps
+        self.instruction_list = self.get_instruction_list(stage_id)
+        self.stage_id = stage_id
+        self.shape = shape
+        self.dtype = dtype
+        self.chunk = num_chunks
+        self.send_dtypes_map = send_dtypes_map
+        builder.topo = deps
+        builder.dataloader = data_iterator
+        builder.loss_fn = loss_fn
+        self.src_loss_rank = -1
+        self.global_mesh = global_mesh
+        if self.global_mesh:
+            all_ranks = list(range(dist.get_world_size()))
+            dp_rank = self.global_mesh.get_data_parallel_rank()
+            tp_rank = self.global_mesh.get_tensor_parallel_rank()
+            same_pipeline_group = [
+                rank for rank in all_ranks if self.global_mesh.get_strategy_coordinate(rank)[1:] == [dp_rank, tp_rank]
+            ]
+            for rank in same_pipeline_group:
+                if self.global_mesh.get_strategy_coordinate(rank)[0] == self.global_mesh.size(0) - 1:
+                    self.src_loss_rank = rank
+                    break
+            # the group through which all ranks in the same pipeline share the final loss outputs
+            self.sync_loss_group = dist.new_group(ranks=same_pipeline_group, backend="nccl")
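A sketch of wiring up a `ScheduleEngine`; `deps`, `meshes`, and `microbatch_iter` are hypothetical objects built from the split graph and dataloader, and the numbers are illustrative:

```python
engine = ScheduleEngine(
    deps=deps,                        # StageDeps describing inter-stage edges
    meshes=meshes,                    # one sub-DeviceMesh per pipeline stage
    schedule=PipelineScheduleType.SIMPLE_1F1B,
    batches=8,                        # number of microbatches per minibatch
    data_iterator=microbatch_iter,
    stage_id=0,
    shape=(128, 2, 1024),             # default p2p tensor shape
    dtype=torch.bfloat16,
    loss_fn=lambda logits: logits.float().mean(),
)
```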
+
+    def set_data_iterator(self, data_iterator: List, data_shape=None):
+        """
+        Assigns minibatch data to the instruction builder.
+
+        Args:
+            data_iterator (List): a minibatch list of microbatch data
+
+        """
+        assert builder.dataloader
+        builder.dataloader = data_iterator
+        if data_shape:
+            self.shape = data_shape
+            builder.constant_data["shape"] = data_shape
+
+    def get_instruction_list(self, stage_id):
+        return self.p_emmiter.get_instruction_list(stage_id)
+
+    def sync_output_loss_per_pipeline(self, loss: torch.Tensor):
+        """
+        A debug-mode function that synchronizes the minibatch loss
+        across all stages of a pipeline.
+
+        Args:
+            loss (torch.Tensor): minibatch loss computed on the last stage
+                (None on all other stages).
+
+        """
+        assert self.global_mesh, "Must initialize per-pipeline dist group before synchronizing loss!"
+        if loss is None:
+            loss = torch.tensor(0.0, dtype=torch.float).cuda(dist.get_rank())
+        dist.broadcast(loss, src=self.src_loss_rank, group=self.sync_loss_group)
+
+        # monkey-patch the loss tensor's backward into a no-op; the broadcast
+        # loss is for reporting only
+        def _empty_backward():
+            return None
+
+        loss.backward = _empty_backward
+        return loss
+
+    def _collect_microbatch_losses(self, outputs):
+        def _empty_backward():
+            return None
+
+        output_losses = []
+        for microbatch_output, microbatch_loss in outputs:
+            if microbatch_loss is None:
+                if isinstance(microbatch_output, Sequence):
+                    for j in range(len(microbatch_output)):
+                        if microbatch_output[j].ndim == 0 and microbatch_output[j].numel() == 1:
+                            loss_value = microbatch_output[j]
+                            break
+                    else:
+                        raise ValueError("Loss values not found.")
+                else:
+                    loss_value = microbatch_output
+            else:
+                loss_value = microbatch_loss
+            output_losses.append(loss_value)
+        if not output_losses:
+            return None
+        tensor_device = output_losses[0].device
+        minibatch_loss = torch.tensor(sum(output_losses), device=tensor_device)
+        # monkey-patch the minibatch loss backward into a no-op; per-microbatch
+        # backward passes have already been run by the schedule
+        minibatch_loss.backward = _empty_backward
+        return minibatch_loss
+
+    @staticmethod
+    def execute(
+        instance,
+        *,
+        deallocate_pipeline_outputs: bool = False,
+        autocast_dtype: torch.dtype = torch.float,
+        enable_autocast: bool = False,
+        grad_scaler=None,
+        param_sync_func=None,
+        grad_sync_func=None,
+        debug_mode=False,
+    ):
+        """
+        Main entry point for executing the forward and backward
+        computation of a minibatch.
+
+        Args:
+            instance (ScheduleEngine): the schedule engine to run
+            deallocate_pipeline_outputs (bool): whether to deallocate output activation tensors once consumed
+            autocast_dtype (torch.dtype): autocast data type
+            enable_autocast (bool): turn on to enable tensor autocast
+            grad_scaler (Callable): gradient scaler
+            param_sync_func (Callable): parameter synchronization function
+            grad_sync_func (Callable): gradient synchronization function
+            debug_mode (bool): turn on to generate debugging outputs
+
+        Returns:
+            A tuple of two elements:
+            1). loss of this minibatch of data,
+            2). a list of tuples of outputs per microbatch, where for each tuple:
+                - 2.1). the first element is the output of the original model
+                - 2.2). the second element is the loss of this microbatch.
+                    If loss_fn is not provided at initialization, the loss
+                    is computed in 2.1) and this element is None
+
+        """
+        reset_global_counter()
+        if instance.schedule == PipelineScheduleType.SIMPLE_1F1B:
+            minibatch_outputs = instance.p_emmiter.instruction_generator.execute(
+                stage_id=instance.stage_id,
+                enable_autocast=enable_autocast,
+                autocast_dtype=autocast_dtype,
+                grad_scaler=grad_scaler,
+                deallocate_pipeline_outputs=deallocate_pipeline_outputs,
+            )
+            minibatch_loss = instance._collect_microbatch_losses(minibatch_outputs)
+            if debug_mode:
+                minibatch_loss = instance.sync_output_loss_per_pipeline(minibatch_loss)
+            return minibatch_loss, minibatch_outputs
+        elif instance.schedule == PipelineScheduleType.INTERLEAVED_1F1B:
+            minibatch_outputs = instance.p_emmiter.instruction_generator.execute(
+                stage_id=instance.stage_id,
+                enable_autocast=enable_autocast,
+                autocast_dtype=autocast_dtype,
+                grad_scaler=grad_scaler,
+                deallocate_pipeline_outputs=deallocate_pipeline_outputs,
+                param_sync_func=param_sync_func,
+                grad_sync_func=grad_sync_func,
+            )
+            minibatch_loss = instance._collect_microbatch_losses(minibatch_outputs)
+            if debug_mode:
+                minibatch_loss = instance.sync_output_loss_per_pipeline(minibatch_loss)
+            return minibatch_loss, minibatch_outputs
+        elif instance.schedule == PipelineScheduleType.ZERO_BUBBLE:
+            minibatch_outputs = instance.p_emmiter.instruction_generator.execute(
+                stage_id=instance.stage_id,
+                enable_autocast=enable_autocast,
+                autocast_dtype=autocast_dtype,
+                grad_scaler=grad_scaler,
+                deallocate_pipeline_outputs=deallocate_pipeline_outputs,
+            )
+            minibatch_loss = instance._collect_microbatch_losses(minibatch_outputs)
+            if debug_mode:
+                minibatch_loss = instance.sync_output_loss_per_pipeline(minibatch_loss)
+            return minibatch_loss, minibatch_outputs
+        else:
+            raise NotImplementedError("Unsupported Schedule!")
+
+
+def validate_pipeline_schedule(plan: PipelineParallelPlan):
+    """
+    Validates the pipeline schedule settings in a PipelineParallelPlan.
+
+    Args:
+        plan (PipelineParallelPlan): configuration of pipeline parallel API attributes
+
+    """
+    if plan.schedule_type == PipelineScheduleType.INTERLEAVED_1F1B:
+        assert plan.virtual_chunks > 1
+    elif plan.schedule_type == PipelineScheduleType.SIMPLE_1F1B:
+        assert plan.virtual_chunks == 1
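One training step then reduces to a single `execute` call. A sketch assuming the `engine` above and a hypothetical optimizer built over the stage parameters:

```python
minibatch_loss, microbatch_outputs = ScheduleEngine.execute(
    engine,
    deallocate_pipeline_outputs=True,
    autocast_dtype=torch.bfloat16,
    enable_autocast=True,
)
optimizer.step()       # hypothetical optimizer over the stage parameters
optimizer.zero_grad()
```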
diff --git a/vescale/pipe/pipe_parser.py b/vescale/pipe/pipe_parser.py
new file mode 100644
index 0000000..18cd180
--- /dev/null
+++ b/vescale/pipe/pipe_parser.py
@@ -0,0 +1,652 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+
+from typing import Sequence, Dict, List, Union, Any, Optional
+import torch
+import re
+import torch.nn as nn
+import logging
+from inspect import signature
+from vescale.pipe.tracer import ModelTracer, HFModelTracer, hf_symbolic_trace
+from torch.fx.passes.split_utils import split_by_tags
+from vescale.plan.pipeline_parallel import PipelineParallelPlan
+from vescale.plan.spec import PipelineSplitMethodType, TracerType
+
+NUM_DEFAULT_ARGS = 3
+
+try:
+    # New import path
+    from torch.export._trace import _export_to_torch_ir  # noqa: PGH004
+except ImportError:
+    try:
+        # Old import path
+        from torch._export import _export_to_torch_ir  # noqa: F401
+    except ImportError:
+        print("Could not import _export_to_torch_ir. Please make sure your PyTorch version is newer than 2.2.0.")
+
+
+logger = logging.getLogger(__file__)
+
+
+class PipeParser:
+    def __init__(self):
+        self.orig_to_split_fqn_mapping = {}
+
+    def parse(
+        self, module: nn.Module, plan: Optional[PipelineParallelPlan] = None, **kwargs: Any
+    ) -> torch.fx.GraphModule:
+        """
+        Applies cascade tracing: first the upstream torch.fx symbolic tracer, then the
+        HuggingFace tracer, and finally the dynamo export tracer. To trigger the cascade
+        parser, select TracerType.AUTO in the PipelineParallelPlan's tracer_type field.
+
+        Args:
+            module (nn.Module): the model whose forward execution graph is traced.
+
+        Returns:
+            The traced model graph.
+
+        """
+        parser_args = {}
+        if plan and plan.smallest_unsplittable_units:
+            parser_args["partition_units"] = plan.smallest_unsplittable_units
+        if kwargs:
+            parser_args.update(kwargs)
+        try:
+            msg = "Applying default torch.fx symbolic tracing..."
+            logger.info(msg)
+            traced = self.parse_torch_fx(module, **parser_args)
+        except Exception as e:
+            try:
+                msg = f"Default torch.fx symbolic tracing failed: {e}\nApplying HuggingFace Tracer..."
+                logger.warning(msg)
+                traced = self.parse_huggingface_fx(module, **parser_args)
+            except Exception as e2:
+                try:
+                    msg = f"HuggingFace tracing failed: {e2}\nApplying Dynamo Export Tracer..."
+                    logger.warning(msg)
+                    traced = self.parse_dynamo_export(module, **parser_args)
+                except Exception as e3:
+                    msg = f"Dynamo export tracing failed: {e3}"
+                    logger.warning(msg)
+                    raise e3
+        print(f"Below is a visualization of the traced model graph:\n{traced}")
+        return traced
+
+    def partition_stage(
+        self, module: nn.Module, model_graph: torch.fx.GraphModule, plan: PipelineParallelPlan
+    ) -> torch.fx.GraphModule:
+        """
+        Partitions the model by the split criterion. The function first annotates graph nodes
+        and ops with stage boundaries, and then splits the stages into model partition modules
+        (torch.fx.GraphModule).
+
+        Args:
+            module (nn.Module): the model.
+            model_graph (torch.fx.GraphModule): the trace graph of the model.
+            plan (PipelineParallelPlan): configuration of the pipeline parallelism API.
+
+        Returns:
+            The executable trace graph partitioned by stage boundaries,
+            and mappings of submodules before and after partition.
+
+        """
+        split_points = self.split(model_graph, plan)
+        plan.split_points = split_points
+        split_graph = self.split_stage(model_graph, module, plan)
+        return split_graph
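A sketch of the cascade parser on a toy model, assuming `PipelineParallelPlan` accepts these fields as constructor arguments (they are the fields this file reads; all values are illustrative):

```python
import torch.nn as nn

toy = nn.Sequential(nn.Linear(16, 16), nn.GELU(), nn.Linear(16, 16), nn.GELU())
plan = PipelineParallelPlan(
    num_stages=2,
    virtual_chunks=1,
    split_method=PipelineSplitMethodType.UNIFORM,
)
parser = PipeParser()
graph = parser.parse(toy, plan)                         # TracerType.AUTO cascade
split_graph = parser.partition_stage(toy, graph, plan)  # stage-annotated split
```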
+
+    def split(self, graph: torch.fx.GraphModule, plan: PipelineParallelPlan):
+        """
+        Generates or verifies pipeline split points, and writes updates to the PipelineParallelPlan.
+
+        Args:
+            graph (torch.fx.GraphModule): symbolic trace graph of the entire model
+            plan (PipelineParallelPlan): configuration of attributes for the pipeline parallelism API
+
+        Returns:
+            A list of fully qualified names of stage split points.
+
+        """
+        criterion = plan.split_method
+        boundaries = plan.split_points
+        nodes = list(graph.graph.nodes)
+        trimmed_nodes = nodes[1:-1]  # remove the input and output nodes of the graph
+        node_names = [nd.name for nd in nodes]
+        trimmed_node_names = []
+        for nd in nodes[1:-1]:
+            if nd.op == "call_module":
+                trimmed_node_names.append(nd.target)
+            else:
+                trimmed_node_names.append(nd.name)
+        num_stages = plan.num_stages
+        num_chunk_per_stage = plan.virtual_chunks
+        num_model_partitions = num_stages * num_chunk_per_stage
+        nodes_size = len(trimmed_nodes)
+        trimmed_module_indices = [idx for idx in range(nodes_size) if trimmed_nodes[idx].op == "call_module"]
+        modules_only_size = len(trimmed_module_indices)
+        assert criterion in [
+            PipelineSplitMethodType.UNIFORM,
+            PipelineSplitMethodType.MANUAL,
+            PipelineSplitMethodType.AUTO,
+            PipelineSplitMethodType.PARAMETERS,
+            PipelineSplitMethodType.SIMULATOR,
+            PipelineSplitMethodType.FLOPS,
+        ]
+        if criterion == PipelineSplitMethodType.UNIFORM:
+            if plan.uniform_split_ops:
+                module_indices = self._partition_uniform(modules_only_size, num_model_partitions)
+                indices = [trimmed_module_indices[module_indices[idx]] for idx in range(len(module_indices))]
+            else:
+                indices = self._partition_uniform(nodes_size, num_model_partitions)
+            final_boundaries = []
+            for idx in indices:
+                if nodes[idx].op == "call_module" and trimmed_nodes[idx].name != trimmed_nodes[idx].target:
+                    final_boundaries.append(trimmed_nodes[idx].name.replace("_", "."))
+                else:
+                    final_boundaries.append(trimmed_nodes[idx].name)
+            plan.split_points = final_boundaries
+        elif criterion == PipelineSplitMethodType.MANUAL:
+            assert boundaries, "Must provide stage boundaries for MANUAL mode during stage partition!"
+            if boundaries and all(isinstance(x, str) for x in boundaries):
+                for fqn in boundaries:
+                    assert (
+                        fqn in node_names
+                        or fqn.replace(".", "_") in node_names
+                        or any(name.startswith(fqn) for name in node_names)
+                    )
+            elif boundaries and all(isinstance(x, int) for x in boundaries):
+                # Under indexing-based partition, boundaries index into the graph's execution order
+                boundaries.sort()
+                assert 0 <= boundaries[0] <= boundaries[-1] < len(nodes)
+                # convert submodule indices into fully qualified names
+                new_boundaries = []
+                for idx in boundaries:
+                    if nodes[idx].op == "call_module":
+                        new_boundaries.append(nodes[idx].name.replace("_", "."))
+                    else:
+                        new_boundaries.append(nodes[idx].name)
+                boundaries = new_boundaries
+            else:
+                raise ValueError("Input must be either a list of path strings or partition indices!")
+            if boundaries[-1] != node_names[-2]:
+                boundaries.append(node_names[-2])
+
+            final_boundaries = self._handle_virtual_stage_boundaries(
+                boundaries,
+                trimmed_node_names,
+                num_chunk_per_stage,
+                plan.enable_vpp_split_points,
+            )
+            # assert that no stage boundary is a prefix of another boundary
+            _boundaries = set(final_boundaries)
+            for this_bd in _boundaries:
+                for bd in _boundaries:
+                    if this_bd != bd:
+                        assert not this_bd.startswith(bd)
+            assert len(final_boundaries) == num_model_partitions
+        else:
+            raise NotImplementedError
+        return final_boundaries
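As a worked illustration of the uniform rule below: with 8 trimmed graph nodes and 4 model partitions, the computed boundaries are the last node index of each partition. The sketch is a standalone re-derivation of the arithmetic, for illustration only:

```python
import torch

def uniform_boundaries(num_items: int, num_parts: int):
    chunksize = num_items // num_parts
    parts = torch.arange(0, (num_parts + 1) * chunksize, chunksize)
    for i in range(num_items - chunksize * num_parts):  # spread any residual
        parts[i + 1 :] += 1
    return [x - 1 for x in parts.tolist()[1:]]  # last node index per partition

assert uniform_boundaries(8, 4) == [1, 3, 5, 7]
```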
+    def _partition_uniform(self, num_items, num_parts):
+        assert num_items % num_parts == 0, "#graph nodes must be evenly divisible by #partitions!"
+        assert num_items >= num_parts, "#graph nodes must be no fewer than #model partitions!"
+        parts = [0] * (num_parts + 1)
+        # First check for the trivial edge case
+        if num_items <= num_parts:
+            for p in range(num_parts + 1):
+                parts[p] = min(p, num_items)
+        else:
+            chunksize = num_items // num_parts
+            residual = num_items - (chunksize * num_parts)
+            parts = torch.arange(0, (num_parts + 1) * chunksize, chunksize)
+            for i in range(residual):
+                parts[i + 1 :] += 1
+            parts = parts.tolist()
+        if parts[0] == 0:
+            parts = parts[1:]
+        parts = [x - 1 for x in parts]
+        return parts
+
+    def _handle_virtual_stage_boundaries(
+        self,
+        boundaries: List[Union[str, int]],
+        node_names: List[str],
+        num_chunk_per_stage: int,
+        use_manual_vpp_boundary: bool,
+    ):
+        if isinstance(boundaries[0], int):
+            boundaries = [node_names[idx] for idx in boundaries]
+        if num_chunk_per_stage > 1 and not use_manual_vpp_boundary:
+            new_indices = []
+            indices = list(range(len(node_names)))
+            raw_stage_indices = []
+            for fqn in boundaries:
+                if fqn not in node_names:
+                    fqn = fqn.replace(".", "_")
+                raw_stage_indices.append(node_names.index(fqn))
+            if raw_stage_indices[-1] < len(node_names) - 1:
+                raw_stage_indices.append(len(node_names) - 1)
+            for i in range(len(raw_stage_indices)):
+                if i == 0:
+                    sublist = torch.tensor(indices[: raw_stage_indices[i] + 1])
+                else:
+                    sublist = torch.tensor(indices[raw_stage_indices[i - 1] + 1 : raw_stage_indices[i] + 1])
+                assert (
+                    len(sublist) >= num_chunk_per_stage
+                ), "#operators and modules in a stage must be no smaller than #virtual pipeline chunks!"
+                sublist_list = sublist.tensor_split(num_chunk_per_stage)
+                new_indices += [int(sub[-1]) for sub in sublist_list]
+            boundaries = [node_names[idx] for idx in new_indices]
+        return boundaries
+
+    def annotate_pipeline_stage(
+        self, graph: torch.fx.GraphModule, root_module: nn.Module, boundaries: List, partition_units: List
+    ):
+        """
+        Annotates the stage split boundaries of each stage on the model graph.
+
+        Args:
+            graph (torch.fx.GraphModule): model trace graph
+            root_module (nn.Module): raw model
+            boundaries (List): a list of pipeline stage split points in the form of fully qualified names
+            partition_units (List): smallest unsplittable units in a model trace graph
+
+        Returns:
+            The model graph with stage split points annotated.
+
+        """
+
+        def identify_base_units(submodule, partition_units, submodule_name):
+            return (
+                len(list(submodule.children())) == 0
+                or submodule_name in partition_units
+                or type(submodule) in partition_units
+            )
+
+        splited_module_names = boundaries
+        assert len(splited_module_names) > 0, "must provide at least one stage boundary"
+        max_dfn_for_modules = [0 for _ in range(len(splited_module_names))]
+        node_lists = list(graph.graph.nodes)
+        node_lists_names = [node.name for node in node_lists]
+        node_lists_target_names = [node.target for node in node_lists]
+        submodule_paths = {name for name, _ in root_module.named_modules()}
+        for stage_id, submodule_name in enumerate(splited_module_names):
+            stage_tag = stage_id
+            sub_module_unions = []
+            if submodule_name in node_lists_names:
+                boundary_node = node_lists[node_lists_names.index(submodule_name)]
+            else:
+                boundary_node = node_lists[node_lists_target_names.index(submodule_name)]
+            if submodule_name in submodule_paths:
+                submodule = root_module.get_submodule(submodule_name)
+                if identify_base_units(submodule, partition_units, submodule_name):  # for leaf module
+                    sub_module_unions.append(submodule_name)
+                else:
+                    for name, _ in submodule.named_children():
+                        sub_module_unions.append(submodule_name + "." + name)
+                sub_module_unions = [re.sub(r"\.", "_", name) for name in sub_module_unions]
+            else:
+                if boundary_node.op == "call_method" or boundary_node.op == "call_function":
+                    sub_module_unions.append(boundary_node.name)
+                else:
+                    raise ValueError(
+                        "Stage boundary can only be of ``call_module``, ``call_method`` and ``call_function``!"
+                    )
+            stage_max_dfn = 0
+            # tag nodes in graph execution order; a single O(N) sweep
+            for dfn in range(len(node_lists)):
+                node = node_lists[dfn]
+                if node.name in sub_module_unions:
+                    # TODO: the tag should be partition_chunk{id} instead of stage{id}, as the current
+                    # naming may lead to confusion in interleaved 1F1B schedules
+                    node.tag = f"stage{str(stage_tag)}"
+                    stage_max_dfn = max(dfn, stage_max_dfn)
+            max_dfn_for_modules[stage_id] = stage_max_dfn
+
+        # annotate the first stage
+        for dfn in range(len(node_lists)):
+            if dfn <= max_dfn_for_modules[0]:
+                node_lists[dfn].tag = "stage0"
+            else:
+                break
+
+        slow = 0
+        cur_dfn_num = 0
+        fast = max_dfn_for_modules[cur_dfn_num]
+        # use fast/slow pointers to annotate the remaining nodes
+        while fast < len(node_lists) and slow < len(node_lists):
+            while slow <= fast:
+                node_lists[slow].tag = node_lists[fast].tag
+                slow += 1
+            cur_dfn_num += 1
+            if cur_dfn_num < len(max_dfn_for_modules):
+                fast = max_dfn_for_modules[cur_dfn_num]
+            else:
+                while slow < len(node_lists):
+                    node_lists[slow].tag = node_lists[fast].tag
+                    slow += 1
+        return graph
+
+    def split_stage(
+        self, graph: torch.fx.GraphModule, root_module: nn.Module, plan: PipelineParallelPlan
+    ) -> torch.fx.GraphModule:
+        """
+        Splits a model graph into multiple pipeline stage subgraphs.
+
+        Args:
+            graph (torch.fx.GraphModule): model graph
+            root_module (nn.Module): raw model
+            plan (PipelineParallelPlan): configuration of attributes for the pipeline parallelism API
+
+        Returns:
+            The edited model graph, which contains a subgraph for each virtual module chunk
+            of a pipeline stage. For example,
+            ```
+            Before:
+            original_graph:
+                module1: xxx
+                module2: xxx
+                module3: xxx
+                module4: xxx
+
+            After:
+            split_graph:
+                stage0:
+                    module1: xxx
+                    module2: xxx
+                stage1:
+                    module3: xxx
+                    module4: xxx
+            ```
+
+        """
+        if graph is None:
+            return None
+
+        boundaries = plan.split_points
+        partition_units = plan.smallest_unsplittable_units
+        graph = self.annotate_pipeline_stage(graph, root_module, boundaries, partition_units)
+        tags = [f"stage{str(num)}" for num in range(len(boundaries))]
+        # split by PyTorch upstream's split_by_tags
+        split_graph, orig_to_split_fqn_mapping = split_by_tags(graph, tags, return_fqn_mapping=True)
+        for i in range(1, len(tags)):
+            # input placeholder node of each stage-specific graph
+            placeholder_node = list(getattr(split_graph, tags[i]).graph.nodes)[0]
+            if placeholder_node.op == "placeholder" and placeholder_node.name != "x":
+                placeholder_node.name = "x"
+
+        return split_graph
+
+    def parse_torch_fx(
+        self, model: nn.Module, partition_units: List[str] = None, shard_plan: Dict = None
+    ) -> torch.fx.GraphModule:
+        """
+        Applies torch.fx symbolic tracing to capture the model graph.
+
+        Args:
+            model (nn.Module): raw model
+            partition_units (List[str]): a list of the smallest unsplittable modules, whose
+                underlying components the parser will not flatten during parsing
+            shard_plan (Dict): dictionary of the sharding plan, if users would like to wrap up
+                tensor-parallelized modules as unsplittable units
+
+        Returns:
+            The captured torch.fx.GraphModule.
+
+        """
+        if partition_units is None:
+            partition_units = []
+        input_names = list(signature(model.forward).parameters.keys())
+        if "input_ids" in input_names and "inputs_embeds" in input_names:
+            input_names.remove("inputs_embeds")
+        if shard_plan:
+            hierarchy_substructure_qualified_names = self._hierarchy_structure_names(model, shard_plan)
+            partition_units += hierarchy_substructure_qualified_names
+        traced: torch.fx.GraphModule = hf_symbolic_trace(
+            model,
+            input_names=input_names,
+            disable_check=True,
+            tracer_cls=ModelTracer,
+            partition_modules=partition_units,
+        )
+        return traced
+
+    def parse_dynamo_export(self, model: nn.Module, *args: Sequence, **kwargs: Dict):
+        """
+        Captures the model graph with torch dynamo export.
+
+        Args:
+            model (nn.Module): raw model
+
+        Returns:
+            The captured torch.fx.GraphModule.
+
+        """
+        traced: torch.fx.GraphModule = _export_to_torch_ir(model, args=args, kwargs=kwargs)
+        return traced
+
+    def parse_huggingface_fx(
+        self, model, partition_units: List[str] = None, shard_plan: Dict = None, default_settings: bool = True
+    ):
+        """
+        Applies symbolic tracing with a HuggingFace-like fx tracer.
+
+        Args:
+            model (nn.Module): raw model
+            partition_units (List[str]): a list of the smallest unsplittable modules, whose
+                underlying components the parser will not flatten during parsing
+            shard_plan (Dict): dictionary of the sharding plan, if users would like to wrap up
+                tensor-parallelized modules as unsplittable units
+
+        Returns:
+            The captured torch.fx.GraphModule.
+
+        """
+        if partition_units is None:
+            partition_units = []
+        input_arguments = signature(model.forward).parameters.keys()
+        # The parser flattens the module hierarchy during parsing. Maintain the hierarchy
+        # here so that it can still be accessed by a sharding plan.
+        if shard_plan:
+            hierarchy_substructure_qualified_names = self._hierarchy_structure_names(model, shard_plan)
+            partition_units += hierarchy_substructure_qualified_names
+        input_names = list(input_arguments)
+        if default_settings:
+            default_input_names, default_unit_modules = self._default_parse_info(model, input_names)
+            if default_input_names:
+                input_names = default_input_names
+            if default_unit_modules:
+                partition_units = default_unit_modules
+        if "input_ids" in input_names and "inputs_embeds" in input_names:
+            # the two arguments cannot occur simultaneously
+            input_names.remove("inputs_embeds")
+        input_names = input_names[:NUM_DEFAULT_ARGS]
+        traced: torch.fx.GraphModule = hf_symbolic_trace(
+            model,
+            input_names=input_names,
+            disable_check=True,
+            tracer_cls=HFModelTracer,
+            partition_modules=partition_units,
+        )
+        return traced
+
+    def _hierarchy_structure_names(self, model, shard_plan):
+        modules_to_maintain_hierarchy = set()
+        self._collect_hierachical_modules_paths(model, shard_plan["forward"], modules_to_maintain_hierarchy)
+        self._collect_hierachical_modules_paths(model, shard_plan["parameter"], modules_to_maintain_hierarchy)
+        return modules_to_maintain_hierarchy
+
+    def _collect_hierachical_modules_paths(self, model, plan_dict, module_paths):
+        for path_to_submodule, _ in model.named_modules():
+            for plan_fqn in plan_dict:
+                pattern = plan_fqn.rsplit(".", 1)[0]
+                if (
+                    re.match(pattern, path_to_submodule)
+                    and len(list(model.get_submodule(path_to_submodule).children())) != 0
+                ):
+                    module_paths.add(path_to_submodule)
+
+    def _locate_module_classes(self, model, paths_to_submodules):
+        if paths_to_submodules is None:
+            return paths_to_submodules
+        visited = set(paths_to_submodules)
+        submodule_classes = set()
+        for name, submodule in model.named_modules():
+            if name in visited:
+                submodule_classes.add(type(submodule))
+        return list(submodule_classes)
+
+    def _default_parse_info(self, model, input_names, num_default_args=3):
+        from transformers.models.whisper.modeling_whisper import WhisperModel
+        from transformers.models.mixtral.modeling_mixtral import (
+            MixtralModel,
+            MixtralRMSNorm,
+            MixtralSparseMoeBlock,
+            MixtralAttention,
+        )
+        from transformers.models.biogpt.modeling_biogpt import BioGptModel, BioGptAttention
+        from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Model, DisentangledSelfAttention
+        from transformers.models.marian.modeling_marian import MarianModel, MarianAttention, MarianEncoderLayer
+        from transformers.models.blenderbot.modeling_blenderbot import (
+            BlenderbotModel,
+            BlenderbotAttention,
+            BlenderbotEncoderLayer,
+        )
+        from transformers.models.layoutlmv3.modeling_layoutlmv3 import LayoutLMv3Model, LayoutLMv3SelfAttention
+        from transformers.models.phi.modeling_phi import PhiModel, PhiAttention
+        from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel, GPTNeoXAttention
+        from transformers.models.falcon.modeling_falcon import FalconModel, FalconAttention
+        from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel, GPTBigCodeAttention
+        from transformers.models.vit.modeling_vit import ViTModel, ViTEmbeddings, ViTSelfAttention
+        from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model, Wav2Vec2Attention
+        from transformers.models.speecht5.modeling_speecht5 import SpeechT5Model, SpeechT5Attention
+        from transformers.models.bloom.modeling_bloom import BloomModel, BloomAttention
+
+        model_type = type(model)
+        input_names = partition_unit_classes = None
+        if model_type == MixtralModel:
+            partition_unit_classes = [MixtralRMSNorm, MixtralSparseMoeBlock, MixtralAttention]
+        elif model_type == BioGptModel:
+            partition_unit_classes = [BioGptAttention]
+        elif model_type == DebertaV2Model:
+            partition_unit_classes = [DisentangledSelfAttention]
+        elif model_type == MarianModel:
+            partition_unit_classes = [MarianAttention, MarianEncoderLayer]
+        elif model_type == BlenderbotModel:
+            partition_unit_classes = [BlenderbotAttention, BlenderbotEncoderLayer]
+        elif model_type == LayoutLMv3Model:
+            partition_unit_classes = [LayoutLMv3SelfAttention]
+        elif model_type == PhiModel:
+            partition_unit_classes = [PhiAttention]
+        elif model_type == GPTNeoXModel:
+            partition_unit_classes = [GPTNeoXAttention]
+        elif model_type == FalconModel:
+            partition_unit_classes = [FalconAttention]
+        elif model_type == GPTBigCodeModel:
+            partition_unit_classes = [GPTBigCodeAttention]
+        elif model_type == ViTModel:
+            partition_unit_classes = [ViTEmbeddings, ViTSelfAttention]
+        elif model_type == Wav2Vec2Model:
+            partition_unit_classes = [Wav2Vec2Attention]
+        elif model_type == SpeechT5Model:
+            partition_unit_classes = [SpeechT5Attention]
+        elif model_type == BloomModel:
+            input_names = ["attention_mask", "head_mask", "inputs_embeds"]
+            partition_unit_classes = [BloomAttention]
+        elif model_type == WhisperModel:
+            input_names = ["input_features", "decoder_input_ids"]
+
+        if input_names:
+            input_names = input_names[:num_default_args]
+        return input_names, partition_unit_classes
+
+
+def parse_model_graph(parser: PipeParser, model: nn.Module, plan: PipelineParallelPlan) -> torch.fx.GraphModule:
+    """
+    Pipeline parallelism API that performs parsing with the given tracer type.
+
+    Args:
+        parser (PipeParser): model parser
+        model (nn.Module): raw model
+        plan (PipelineParallelPlan): configuration of the pipeline parallelism API.
+
+    Returns:
+        The captured torch.fx.GraphModule.
+
+    """
+    tracer_type = plan.tracer_type
+    tracer_kwargs = plan.tracer_kwargs
+    if tracer_kwargs is None:
+        tracer_kwargs = {}
+    if tracer_type == TracerType.AUTO:
+        model_graph = parser.parse(model, plan)
+    else:
+        if "partition_units" not in tracer_kwargs and tracer_type in [TracerType.TORCH_FX, TracerType.HF_FX]:
+            tracer_kwargs["partition_units"] = plan.smallest_unsplittable_units
+        if tracer_type == TracerType.TORCH_FX:
+            model_graph = parser.parse_torch_fx(model, **tracer_kwargs)
+        elif tracer_type == TracerType.HF_FX:
+            model_graph = parser.parse_huggingface_fx(model, **tracer_kwargs)
+        elif tracer_type == TracerType.TORCH_DYNAMO:
+            model_graph = parser.parse_dynamo_export(model, **tracer_kwargs)
+        else:
+            raise NotImplementedError(f"Logic of tracer {tracer_type} has not been implemented yet.")
+    return model_graph
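Forcing a specific tracer instead of the AUTO cascade is a matter of two plan fields. A sketch, where the module FQN is hypothetical:

```python
plan.tracer_type = TracerType.HF_FX
plan.tracer_kwargs = {"partition_units": ["decoder.layers.0.self_attn"]}  # hypothetical FQN
graph = parse_model_graph(PipeParser(), model, plan)
```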
+
+
+def split_pipeline_point(model: nn.Module, plan: PipelineParallelPlan):
+    """
+    Pipeline parallelism API that updates the pipeline stage split points.
+
+    Args:
+        model (nn.Module): raw model
+        plan (PipelineParallelPlan): configuration of the pipeline parallelism API.
+
+    Returns:
+        A tuple of the split points, the traced model graph, and the parser instance.
+
+    """
+    # obtain the traced graph of the entire model if pipeline parallelism is on
+    parser = PipeParser()
+    model_graph = parse_model_graph(parser, model, plan)
+    split_points = parser.split(model_graph, plan)
+    plan.split_points = split_points
+    return split_points, model_graph, parser
+
+
+def construct_pipeline_split_graph(model: nn.Module, plan: PipelineParallelPlan, update_split_points: bool = False):
+    """
+    Pipeline parallelism API that performs the pipeline stage split.
+
+    Args:
+        model (nn.Module): raw model
+        plan (PipelineParallelPlan): configuration of the pipeline parallelism API.
+        update_split_points (bool): set this switch on to update the pipeline split points in place.
+
+    Returns:
+        The split torch.fx.GraphModule.
+
+    """
+    parser = PipeParser()
+    model_graph = parse_model_graph(parser, model, plan)
+    if update_split_points:
+        split_points = parser.split(model_graph, plan)
+        plan.split_points = split_points
+    # partition the model graph into virtual pipeline chunks per stage
+    split_graph = parser.split_stage(model_graph, model, plan)
+    return split_graph
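A sketch of the end-to-end split on the toy model from earlier; each per-stage submodule is exposed as a `stage{i}` attribute by `split_by_tags`:

```python
split_graph = construct_pipeline_split_graph(toy, plan, update_split_points=True)
stage0 = split_graph.stage0  # GraphModule for pipeline stage 0
stage1 = split_graph.stage1  # GraphModule for pipeline stage 1
```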
diff --git a/vescale/pipe/pipe_stage.py b/vescale/pipe/pipe_stage.py
new file mode 100644
index 0000000..9e91f56
--- /dev/null
+++ b/vescale/pipe/pipe_stage.py
@@ -0,0 +1,563 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+"""
+The `PipeModule` class is the abstraction of a pipeline stage.
+
+PipeModule accepts microbatch input as 1). a list of data per microbatch, 2). a dictionary
+of data per microbatch, or 3). a torch.Tensor.
+
+PipeModule takes both 1). p2p-transmitted data from incoming stages, and 2). local data inputs.
+
+Each pipeline stage can run forward on a single batch of data, just like an nn.Module; the
+forward functions and the new p2p ops can be combined to implement pipeline forward and backward.
+
+For Example 1.
+    ```python
+    stage: PipeModule = ...
+    single_data = ...  # a single microbatch of data
+    fwd = stage(single_data)
+    p2p_send_recv( ... )
+    ```
+
+For Example 2.
+    ```python
+    stage: PipeModule = ...
+    p2p_data = ...  # a torch.Tensor from the last stage
+    local_data = Dict(...)  # a single microbatch of data
+    fwd = stage(p2p_data, local_inputs=local_data)
+    p2p_send_recv( ... )
+    ```
+
+"""
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+import numpy as np
+import inspect
+import re
+from typing import Dict, List, Tuple, Union, Optional, Sequence, Callable, Any
+from vescale.optim.base_optimizer import BasicOptimizer
+from vescale.optim.distributed_optimizer import DistributedOptimizer
+from vescale.devicemesh_api.api import VeDeviceMesh
+from vescale.ddp.distributed_data_parallel import DistributedDataParallel as DDP
+from vescale.dtensor.dtensor import DTensor
+from vescale.plan import PipelineParallelPlan, PipelineP2PSpec
+from vescale.pipe.pipe_parser import construct_pipeline_split_graph
+from collections import defaultdict
+
+
+class PipeModule(nn.Module):
+    def __init__(
+        self,
+        module: Union[nn.Module, List],
+        doptimizer: Union[BasicOptimizer, DistributedOptimizer],
+        lr_scheduler: Callable,
+        stage_deps: np.ndarray,
+        p2p_index_mapping: Dict,
+        config: PipelineParallelPlan,
+    ):
+        super().__init__()
+        self.stage_modules = {}
+        if isinstance(module, List):
+            for i in range(len(module)):
+                self.stage_modules[i] = module[i]
+        else:
+            self.stage_modules[0] = module
+        self.doptimizer = doptimizer
+        self.lr_scheduler = lr_scheduler
+        self.shared_module_process_groups = defaultdict()
+        self.sync_chunk_ids = set()
+        self.shared_path_this_stage = {}
+        self.shared_module_mapping = {}
+        self.config = config
+        self.num_stages = self.config.num_stages
+        self.virtual_chunks = self.config.virtual_chunks
+        self.stage_deps = stage_deps
+        self.p2p_index_mapping = p2p_index_mapping
+
+    def forward(
+        self,
+        inputs: Union[torch.Tensor, List, Dict],
+        local_inputs: Union[torch.Tensor, List, Dict] = None,
+        chunk_id: int = 0,
+    ):
+        """
+        Forward propagation function of a pipeline stage. This function feeds model chunks
+        with inputs from p2p data transfers and from local dataloaders.
+
+        Args:
+            inputs (torch.Tensor, list, dict): transmitted data received from another pipeline stage.
+            local_inputs (torch.Tensor, list, dict): optional local inputs from dataloaders, used when
+                executing a pipeline schedule.
+            chunk_id (int): identifier dictating which virtual model chunk to execute in the
+                interleaved 1F1B schedule. For the simple 1F1B schedule, chunk_id=0.
+
+        Returns:
+            Output activations.
+
+        """
+        chunk_module = self.stage_modules[chunk_id]
+        if local_inputs is None:
+            if isinstance(inputs, list):
+                return chunk_module(*inputs)
+            elif isinstance(inputs, dict):
+                return chunk_module(**inputs)
+            elif inputs is None:
+                return chunk_module()
+            else:
+                return chunk_module(inputs)
+        else:
+            combined_data = self._prepare_inputs(chunk_module, inputs, local_inputs)
+            return chunk_module(**combined_data)
+
+    def _prepare_inputs(self, module, inputs, local_inputs=None):
+        fwd = module.module.forward if isinstance(module, DDP) else module.forward
+        sig = inspect.signature(fwd)
+        arguments = list(sig.parameters.keys())
+        dict_inputs = self._prepare_data_formats(arguments, inputs)
+        dict_local_inputs = self._prepare_data_formats(arguments, local_inputs)
+        final_inputs = {}
+        for key in arguments:
+            input_val, local_val = dict_inputs.get(key), dict_local_inputs.get(key)
+            if input_val is not None:
+                final_inputs[key] = input_val
+            elif local_val is not None:
+                final_inputs[key] = local_val
+            elif sig.parameters[key].default is not inspect.Parameter.empty:
+                final_inputs[key] = sig.parameters[key].default
+        return final_inputs
+
+    def _prepare_data_formats(self, keys, data):
+        if data is None or (isinstance(data, Sequence) and len(data) == 1 and data[0] is None):
+            if keys:
+                return {keys[0]: None}
+            return None
+        if isinstance(data, torch.Tensor):
+            data = [data]
+        if isinstance(data, Sequence):
+            args_length = min(len(data), len(keys))
+            data = {keys[i]: data[i] for i in range(args_length)}
+        return data
+
+    def __getitem__(self, module_chunk_id: int):
+        assert module_chunk_id in self.stage_modules, "Virtual chunk id does not exist!"
+        return self.stage_modules[module_chunk_id]
+
+    @property
+    def get_optimizer(self):
+        return self.doptimizer
+
+    @property
+    def get_lr_scheduler(self):
+        return self.lr_scheduler
+
+    def parameters(self):
+        parameters = []
+        for chunk_id in range(self.virtual_chunks):
+            parameters += list(self.stage_modules[chunk_id].parameters())
+        return parameters
+
+    def has_shared_params(self, global_mesh: VeDeviceMesh, group_id: int, tp_rank: int) -> bool:
+        """
+        Checks whether this stage has submodules whose parameters or gradients require
+        synchronization. An additional use case of this function is to dictate whether a
+        submodule's shared parameter (invoked by self.get_shared_module()) participates
+        in grad norm clipping.
+
+        Args:
+            global_mesh (VeDeviceMesh): global DeviceMesh with which one looks up communication information.
+            group_id (int): specifies the group of modules across stages to synchronize. Defaults to 0.
+            tp_rank (int): tensor model parallel rank of the current stage.
+
+        Returns:
+            Whether this stage contains sharable parameters.
+
+        """
+        local_rank = global_mesh.get_local_rank()
+        return not (
+            not self.shared_module_process_groups
+            or tp_rank not in self.shared_module_process_groups[group_id]
+            or local_rank not in dist.get_process_group_ranks(self.shared_module_process_groups[group_id][tp_rank])
+        )
+
+    def sync_shared_params(
+        self, global_mesh: VeDeviceMesh, group_id: int = 0, share_params: bool = True, chunk_id: int = 0
+    ):
+        """
+        Synchronizes the parameters (or gradients) of reused modules, e.g. Embedding.
+        This function is invoked in each run of a PP schedule.
+
+        Args:
+            global_mesh (VeDeviceMesh): global DeviceMesh with which one looks up communication information.
+            group_id (int): specifies the group of modules across stages to synchronize. Defaults to 0.
+            share_params (bool): if True, synchronize weight parameters; otherwise, synchronize gradients.
+            chunk_id (int): identifies whether the current virtual model chunk in this stage has any
+                module to synchronize.
+
+        """
+        tp_rank = global_mesh.get_tensor_parallel_rank()
+        if (
+            not self.has_shared_params(global_mesh, group_id=group_id, tp_rank=tp_rank)
+            or chunk_id not in self.sync_chunk_ids
+        ):
+            return
+        # assume that each model chunk has at most 1 sharable submodule per shared group
+        shared_submodule_path = self.shared_path_this_stage[(group_id, chunk_id)]
+        model_chunk = self.stage_modules[chunk_id]
+        if isinstance(model_chunk, DDP):
+            model_chunk = model_chunk.module
+        target_module = model_chunk.get_submodule(shared_submodule_path)
+        if getattr(target_module, "get_word_embeddings_weight", None):
+            target_module = target_module.get_word_embeddings_weight()
+
+        # assume the tp coordinate is always the last dimension
+        sync_group = self.shared_module_process_groups[group_id][tp_rank]
+        group_size = dist.get_world_size(group=sync_group)
+
+        if share_params:
+            if isinstance(target_module.data, DTensor):
+                dist.all_reduce(target_module.data._local_tensor, group=sync_group)
+            else:
+                dist.all_reduce(target_module.data, group=sync_group)
+            target_module.data /= group_size
+        else:
+            # if the type is DTensor, then reduce local_tensor.grad
+            if target_module.grad is not None:
+                target_module.grad.data /= group_size
+                dist.all_reduce(target_module.grad.data, group=sync_group)
+            else:  # DDP module
+                target_module.main_grad /= group_size
+                dist.all_reduce(target_module.main_grad, group=sync_group)
+
+
+def construct_stage_modules(
+    model: nn.Module,
+    plan: PipelineParallelPlan,
+    global_mesh: VeDeviceMesh,
+    update_split_points: bool = False,
+):
+    """
+    Pipeline parallelism API that constructs the ingredients for building a PipeModule.
+
+    Args:
+        model (nn.Module): raw model
+        plan (PipelineParallelPlan): configuration of the pipeline parallelism API.
+        global_mesh (VeDeviceMesh): global DeviceMesh with which one looks up communication information.
+        update_split_points (bool): set this switch on to update the pipeline split points in place.
+
+    Returns:
+        A triplet of 1). the list of modules in a pipeline stage, 2). an abstraction of the
+        send-receive dependency relationships among stages, and 3). the P2P input index mapping.
+
+    """
+    num_stages = plan.num_stages
+    virtual_chunks = plan.virtual_chunks
+    split_graph = construct_pipeline_split_graph(model, plan, update_split_points=update_split_points)
+
+    # assign modules to stages, establish stage dependency and input mapping
+    stage_modules, stage_dependency, p2p_index_mapping = build_stage_module_and_dependency(
+        split_graph,
+        num_stages,
+        virtual_chunks,
+        stage_id=global_mesh.get_pipeline_parallel_rank(),
+    )
+    submodules_this_stage = []
+    for chunk_id in range(len(stage_modules)):
+        submodules_this_stage.append(stage_modules[chunk_id])
+    return submodules_this_stage, stage_dependency, p2p_index_mapping
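The triplet returned here is exactly what `PipeModule`'s constructor consumes, and `construct_pipeline_stage` below wraps this sequence. A sketch with hypothetical `model`, `plan`, an initialized `global_mesh`, and illustrative inputs:

```python
modules, stage_deps, p2p_mapping = construct_stage_modules(
    model, plan, global_mesh, update_split_points=True
)
stage = PipeModule(modules, None, None, stage_deps, p2p_mapping, plan)
out = stage(p2p_inputs, local_inputs=local_batch, chunk_id=0)  # hypothetical inputs
```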
+
+    """
+    stage_modules, stage_dependency, p2p_index_mapping = construct_stage_modules(
+        model, plan, global_mesh, update_split_points
+    )
+    return PipeModule(stage_modules, None, lr_scheduler, stage_dependency, p2p_index_mapping, plan)
+
+
+def build_shared_module_group(
+    pipe_module: PipeModule,
+    split_graph: torch.fx.GraphModule,
+    num_stages: int,
+    virtual_chunks: int,
+    shared_module_path_groups: List[List],
+    global_mesh: VeDeviceMesh,
+):
+    """
+    Pipeline Parallelism API that establishes groups of modules which
+    synchronize parameters or gradients with one another.
+
+    Args:
+        pipe_module (PipeModule): pipeline stage to which the synchronized mapping is assigned.
+        split_graph (torch.fx.GraphModule): the global model graph split into stages.
+        num_stages (int): number of pipeline stages.
+        virtual_chunks (int): number of virtual pipeline stage chunks in a stage.
+        shared_module_path_groups (List[List]): list of groups of module fully qualified names,
+            where modules in the same group synchronize parameters or gradients.
+        global_mesh (VeDeviceMesh): global DeviceMesh with which one looks up communication information.
+
+    Returns:
+        Tuple of (1) a dictionary of shared group items, (2) a dictionary of the shared groups this stage
+        is involved in, (3) synchronized model chunk ids, and (4) the path to the shared submodule, if applicable.
+
+    """
+    shared_module_process_groups = defaultdict()
+    shared_module_mapping = {}
+    sync_chunk_ids = set()
+    shared_path_this_stage = {}
+    module_partition_names_by_stage = [[] for _ in range(num_stages)]
+    num_model_partitions = num_stages * virtual_chunks
+    for j in range(num_model_partitions):
+        module_partition_names_by_stage[j % num_stages].append(f"stage{j}")
+    stage_id = global_mesh.get_pipeline_parallel_rank()
+    # establish process groups for synchronizing shared embeddings
+    if shared_module_path_groups:
+        shared_module_process_groups, shared_module_mapping, shared_info = _establish_shared_module_groups(
+            num_stages,
+            virtual_chunks,
+            module_partition_names_by_stage,
+            split_graph,
+            shared_module_path_groups,
+            global_mesh,
+        )
+        for group_id, group in enumerate(shared_info):
+            for _stage_id, chunk_id, path in group:
+                if _stage_id == stage_id:
+                    sync_chunk_ids.add(chunk_id)
+                    shared_path_this_stage[(group_id, chunk_id)] = path
+    pipe_module.shared_module_process_groups = shared_module_process_groups
+    pipe_module.shared_module_mapping = shared_module_mapping
+    pipe_module.sync_chunk_ids = sync_chunk_ids
+    pipe_module.shared_path_this_stage = shared_path_this_stage
+    return shared_module_process_groups, shared_module_mapping, sync_chunk_ids, shared_path_this_stage
+
+
+def build_stage_module_and_dependency(
+    split_graph: torch.fx.GraphModule,
+    num_stages: int,
+    virtual_chunks: int,
+    stage_id: int,
+):
+    """
+    Establishes the sub-modules of a stage as well as the send-receive relationships among stages.
+
+    Args:
+        split_graph (torch.fx.GraphModule): the global model graph split into stages.
+        num_stages (int): number of pipeline stages.
+        virtual_chunks (int): number of virtual pipeline stage chunks in a stage.
+        stage_id (int): pipeline stage id.
+
+    Returns:
+        Submodules of a pipeline stage, inter-stage dependency, and P2P input mapping.
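+
+    Example:
+        A minimal sketch; `split_graph` is assumed to come from
+        `construct_pipeline_split_graph` with submodules named `stage0`, `stage1`, ...::
+
+            modules, deps, p2p_mapping = build_stage_module_and_dependency(
+                split_graph, num_stages=2, virtual_chunks=1, stage_id=0
+            )
+            chunk_zero = modules[0]  # the model chunk executed by this stage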
+
+    """
+    # generate inter-stage communication dependencies and the communication mapping
+    stage_dependency, p2p_index_mapping = _generate_stage_dependencies(split_graph, num_stages, virtual_chunks)
+    # build the sub-modules belonging to the current pipeline stage
+    stage_modules = _build_module(split_graph, num_stages, virtual_chunks, stage_id)
+    return stage_modules, stage_dependency, p2p_index_mapping
+
+
+def _generate_stage_dependencies(graph: torch.fx.GraphModule, num_stage: int, virtual_chunks: int):
+    """
+    Generates the inter-stage dependency and the P2P index mapping across stages.
+
+    Args:
+        graph (torch.fx.GraphModule): the whole trace graph of the model.
+        num_stage (int): number of pipeline stages.
+        virtual_chunks (int): number of virtual pipeline chunks per stage.
+
+    Returns:
+        The inter-stage dependency matrix and the P2P index mapping.
+
+    """
+    stage_to_chunk_mapping = _get_stage_to_chunk_mapping(virtual_chunks, num_stage)
+    # invert the mapping: partition id -> stage id
+    _stage_to_chunk_mapping = {}
+    for stage_id, partition_ids in stage_to_chunk_mapping.items():
+        for part_id in partition_ids:
+            _stage_to_chunk_mapping[part_id] = stage_id
+    stage_to_chunk_mapping = _stage_to_chunk_mapping
+
+    stage_rule = r"stage\d+"
+    stage2node = {}
+    for node in graph.graph.nodes:
+        if re.match(stage_rule, node.name):
+            stage2node.update({node.name: node})
+
+    stage_deps = np.zeros((num_stage, num_stage))
+    for node_name, node in stage2node.items():
+        partition_id = int(node_name[5:])
+        stage_id = stage_to_chunk_mapping[partition_id]
+        node_user = node.users.keys()
+        for u_node in node_user:
+            if u_node.name in stage2node:
+                u_id = int(u_node.name[5:])
+                target_stage_id = stage_to_chunk_mapping[u_id]
+                if stage_deps[target_stage_id][stage_id] or stage_id == num_stage - 1:
+                    # skip back edges to avoid cycles between stages
+                    continue
+                stage_deps[stage_id][target_stage_id] = 1
+
+    # construct the p2p index mapping
+    p2p_index_mapping = {}
+    for node_name, node in stage2node.items():
+        partition_id = int(node_name[5:])
+        stage_id = stage_to_chunk_mapping[partition_id]
+        args_mapping = []
+        for input_id, arg_node in enumerate(node.args):
+            if arg_node.name in stage2node:
+                arg_partition_id = int(arg_node.name[5:])
+                arg_stage_id = stage_to_chunk_mapping[arg_partition_id]
+                args_mapping.append(PipelineP2PSpec(arg_stage_id, input_id))
+            else:  # the input comes from local data
+                args_mapping.append(PipelineP2PSpec(stage_id, input_id))
+        p2p_index_mapping.update({stage_id: args_mapping})
+
+    return stage_deps, p2p_index_mapping
+
+
+def _establish_shared_module_groups(
+    num_stage,
+    virtual_chunks,
+    module_partition_names_by_stage,
+    split_graph,
+    shared_module_path_groups,
+    global_mesh: VeDeviceMesh,
+):
+    """
+    Identifies groups of modules that share gradients/weights, e.g. embedding layers,
+    upon initialization and at the end of a pipeline schedule run.
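+
+    Returns:
+        Tuple of (1) process groups keyed by group id and tensor parallel rank, (2) a mapping from
+        group id to the pipeline stages sharing that group, and (3) per-group lists of
+        (stage_id, chunk_id, module_path) triples locating each shared module.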
+    """
+    all_named_modules = [[] for _ in range(num_stage)]
+    for stage_id in range(num_stage):
+        for chunk_id in range(virtual_chunks):
+            key_name = module_partition_names_by_stage[stage_id][chunk_id]
+            module_graph = split_graph.get_submodule(key_name)
+            all_named_modules[stage_id].append({name for name, _ in module_graph.named_modules()})
+
+    shared_module_paths = [[] for _ in range(len(shared_module_path_groups))]
+    for idx, shared_module_group in enumerate(shared_module_path_groups):
+        for module_path in shared_module_group:
+            stage_id, chunk_id = _locate_shared_module(module_path, all_named_modules, num_stage, virtual_chunks)
+            shared_module_paths[idx].append((stage_id, chunk_id, module_path))
+    shared_stages_groups = [
+        [stage for stage, _, _ in shared_module_paths[idx]] for idx in range(len(shared_module_path_groups))
+    ]
+
+    all_tp_submeshes = global_mesh.get_global_tensor_parallel_meshes()
+    # TODO: in the future, keep track of multiple groups of shared modules
+    all_tp_groups = []
+    map_id = 0
+    for dm in all_tp_submeshes:
+        mesh_list = dm.mesh.tolist()
+        converted_pp_ranks = [global_mesh.get_strategy_coordinate(_idx)[0] for _idx in mesh_list]
+        assert all(i == converted_pp_ranks[0] for i in converted_pp_ranks)
+        for pp_rank in shared_stages_groups[map_id]:
+            if pp_rank == converted_pp_ranks[0]:
+                all_tp_groups.append(mesh_list)
+                break
+
+    shared_tp_comm_groups = list(zip(*all_tp_groups))
+    shared_module_process_groups = defaultdict(dict)
+    shared_module_mapping = {}
+    shared_module_mapping[map_id] = shared_stages_groups[map_id]
+    for tp_idx, shared_group in enumerate(shared_tp_comm_groups):
+        sync_embed_pg = dist.new_group(ranks=shared_group, backend="nccl")
+        shared_module_process_groups[map_id][tp_idx] = sync_embed_pg
+    return shared_module_process_groups, shared_module_mapping, shared_module_paths
+
+
+def _locate_shared_module(module_path, all_named_modules, num_stage, virtual_chunks):
+    for stage_id in range(num_stage):
+        for chunk_id in range(virtual_chunks):
+            if module_path in all_named_modules[stage_id][chunk_id]:
+                return stage_id, chunk_id
+    raise ValueError(f"Module to be synchronized not found: {module_path}")
+
+
+def _build_model_chunks(stage_id, model_graph, mapping):
+    assert stage_id in mapping
+    pipeline_chunks = {}
+    unique_id = 0
+    for chunk_id, partition_id in enumerate(mapping[stage_id]):
+        key = f"stage{partition_id}"
+        virtual_pipeline_module = getattr(model_graph, key)
+        # assign a unique id to each leaf (low-level) submodule
+        for _, submodule in virtual_pipeline_module.named_modules():
+            if len(list(submodule.children())) == 0:
+                registered_module_id = f"module_{stage_id}_{chunk_id}_{unique_id}"
+                submodule.module_id = registered_module_id
+                unique_id += 1
+        pipeline_chunks[chunk_id] = virtual_pipeline_module
+    return pipeline_chunks
+
+
+def _build_module(model_graph: torch.fx.GraphModule, num_stages: int, num_model_chunks: int, stage_id: int):
+    """
+    Builds model chunks by stage, and assigns a unique submodule id to every leaf module.
+
+    Args:
+        model_graph (torch.fx.GraphModule): the model trace graph with stage partitions.
+        num_stages (int): number of pipeline stages.
+        num_model_chunks (int): number of virtual pipeline chunks per stage.
+        stage_id (int): pipeline stage id.
+
+    Returns:
+        Mapping of chunk id to model partitions of the current stage.
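+
+    Example:
+        With num_stages=2 and num_model_chunks=2, partitions are assigned round-robin,
+        so stage 0 receives partitions `stage0` and `stage2` as chunks 0 and 1, i.e.
+        the result is {0: <stage0 module>, 1: <stage2 module>}.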
+ + """ + stage_to_chunk = _get_stage_to_chunk_mapping(num_model_chunks, num_stages) + return _build_model_chunks(stage_id, model_graph, stage_to_chunk) + + +def _get_stage_to_chunk_mapping(num_model_chunks, num_stages): + """ + Gets a mapping from stage id to model partition ids. + + Args: + num_model_chunks (int): number of virtual pipeline chunks per stage. + num_stages (int): number of pipeline stages. + + Returns: + Mapping from stages to their model chunks. + + """ + if num_model_chunks == 1: + stage_to_chunk = {i: [i] for i in range(num_stages)} + else: + length = num_stages * num_model_chunks + stage_to_chunk = {i: [] for i in range(num_stages)} + for i in range(length): + stage_to_chunk[i % num_stages].append(i) + return stage_to_chunk diff --git a/vescale/pipe/tracer.py b/vescale/pipe/tracer.py new file mode 100644 index 0000000..85ded75 --- /dev/null +++ b/vescale/pipe/tracer.py @@ -0,0 +1,709 @@ +################################################################################ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +# Modification Copyright 2023 ByteDance Ltd. and/or its affiliates. +################################################################################ + +import torch +import torch.nn as nn +import torch.fx as fx +import collections +import warnings +import math +import inspect +from torch.fx import Tracer, Graph, Proxy, GraphModule +from torch.fx.proxy import ParameterProxy +from transformers.utils.fx import ( + _proxies_to_metas, + _generate_random_int, + check_if_model_is_supported, + _FX_SUPPORTED_MODELS_WITH_KV_CACHE, + _IS_IN_DEBUG_MODE, + _MANUAL_META_OVERRIDES, + HFProxy, + HFAttribute, + HFTracer, +) + +try: + from transformers.utils.fx import _gen_constructor_wrapper +except Exception as e: + warnings.warn("Util path changed. 
Now load from a new path") + from transformers.utils.fx import gen_constructor_wrapper as _gen_constructor_wrapper + +from transformers.utils.import_utils import ( + TORCH_FX_REQUIRED_VERSION, + get_torch_version, + is_torch_fx_available, + is_peft_available, +) +from torch.fx._compatibility import compatibility +from transformers.modeling_utils import PreTrainedModel +from typing import Any, Callable, Dict, List, Optional, Union, Sequence, Type +from transformers.models.auto import get_values +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_BACKBONE_MAPPING_NAMES, + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_CTC_MAPPING_NAMES, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_MASKED_LM_MAPPING_NAMES, + MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, + MODEL_FOR_PRETRAINING_MAPPING_NAMES, + MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, +) + +if is_peft_available(): + from peft import PeftModel + + +_IS_PARTITION_MODULE = "PARTITION" + + +class ModelTracer(fx.Tracer): + def __init__(self, *args, **kwargs): + super().__init__() + + def is_leaf_module(self, m, module_qualified_name): + return ( + m.__module__.startswith("torch.nn") + or m.__module__.startswith("torch.ao.nn") + or hasattr(m, _IS_PARTITION_MODULE) + ) and not isinstance(m, torch.nn.Sequential) + + +class HFModelTracer(Tracer): + """ + Tracer that is able to symbolically trace models from the library. To do that, it uses the HFProxy instead of the + regular PyTorch torch.fx.Proxy. + """ + + # Feature flag for proxying accesses to buffer values + proxy_buffer_attributes: bool = True + allow_insert_stateless_mods: bool = True + _TORCH_METHODS_TO_PATCH = [ + "arange", + "zeros", + "ones", + "full", + "full_like", + "eye", + "empty", + "tensor", + "clamp", + "finfo", + ] + supported_archs = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel) + + def __init__(self, autowrap_modules=(math,), autowrap_functions=(), partition_modules=None): + super().__init__(autowrap_modules=autowrap_modules, autowrap_functions=autowrap_functions) + + if not is_torch_fx_available(): + raise ImportError( + f"Found an incompatible version of torch. Found version {get_torch_version()}, but only version " + f"{TORCH_FX_REQUIRED_VERSION} is supported." + ) + + self.visited_partition_module_paths = set() + self.partition_module_classes_and_fqns = set() if partition_modules is None else set(partition_modules) + + def _generate_dummy_input( + self, model: PreTrainedModel, input_name: str, shape: List[int], input_names: List[str] + ) -> Dict[str, torch.Tensor]: + """Generates dummy input for model inference recording.""" + # Retrieving the model class, either from the "class_for_deserialization" attribute if the model was restored + # from pickle, or from the "__class__" attribute in the general case. 
+        model_class_name = getattr(model, "class_for_deserialization", model.__class__).__name__
+        device = model.device
+        inputs_dict = {}
+
+        # when tracing a model with KV cache, we simply need to ensure that the KV cache length is larger than one to
+        # rightfully pass certain control flows (Example: https://github.com/huggingface/transformers/blob/5c8d941d66734811d2ef6f57f15b44f7fb7a98c4/src/transformers/modeling_attn_mask_utils.py#L162).
+        # After tracing, the model can then still be used with arbitrary lengths different than the one used during tracing.
+        kv_cache_length = 5
+
+        if input_name in ["labels", "start_positions", "end_positions"]:
+            batch_size = shape[0]
+            if model_class_name in [
+                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES),
+                *get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES),
+                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES),
+                *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES),
+                *get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES),
+            ]:
+                inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device)
+            elif model_class_name in [
+                *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES),
+                *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES),
+                "XLNetForQuestionAnswering",
+            ]:
+                inputs_dict["start_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device)
+                inputs_dict["end_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device)
+            elif model_class_name in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES):
+                if not hasattr(model.config, "problem_type") or model.config.problem_type is None:
+                    raise ValueError(
+                        "Could not retrieve the problem type for the sequence classification task, please set "
+                        'model.config.problem_type to one of the following values: "regression", '
+                        '"single_label_classification", or "multi_label_classification".'
+                    )
+
+                if model.config.problem_type == "regression":
+                    labels_shape = (batch_size, model.config.num_labels)
+                    labels_dtype = torch.float32
+                elif model.config.problem_type == "single_label_classification":
+                    labels_shape = (batch_size,)
+                    labels_dtype = torch.long
+                elif model.config.problem_type == "multi_label_classification":
+                    labels_shape = (batch_size, model.config.num_labels)
+                    labels_dtype = torch.float32
+                else:
+                    raise ValueError(
+                        'Expected model.config.problem_type to be either: "regression", "single_label_classification"'
+                        f', or "multi_label_classification", but "{model.config.problem_type}" was provided.'
+                    )
+                inputs_dict["labels"] = torch.zeros(*labels_shape, dtype=labels_dtype, device=device)
+
+            elif model_class_name in [
+                *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES),
+                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES),
+                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES),
+                *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES),
+                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES),
+                *get_values(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES),
+                "GPT2DoubleHeadsModel",
+                "PeftModelForCausalLM",
+                "PeftModelForSeq2SeqLM",
+            ]:
+                inputs_dict["labels"] = torch.zeros(shape, dtype=torch.long, device=device)
+            elif model_class_name in [*get_values(MODEL_FOR_CTC_MAPPING_NAMES)]:
+                inputs_dict["labels"] = torch.zeros(shape, dtype=torch.float32, device=device)
+            else:
+                raise NotImplementedError(
+                    f"Generating the dummy input named {input_name} for {model_class_name} is not supported yet."
+ ) + elif "pixel_values" in input_name: + batch_size = shape[0] + image_size = getattr(model.config, "image_size", None) + if image_size is None: + if hasattr(model.config, "vision_config"): + image_size = model.config.vision_config.image_size + elif hasattr(model.config, "encoder"): + image_size = model.config.encoder.image_size + else: + image_size = (_generate_random_int(), _generate_random_int()) + + # If no num_channels is in the config, use some arbitrary value. + num_channels = getattr(model.config, "num_channels", 3) + if not isinstance(image_size, collections.abc.Iterable): + image_size = (image_size, image_size) + height, width = image_size + inputs_dict[input_name] = torch.zeros( + batch_size, num_channels, height, width, dtype=torch.float32, device=device + ) + elif "bbox" in input_name: + inputs_dict[input_name] = torch.zeros(*shape, 4, dtype=torch.float, device=device) + elif "input_features" in input_name: + inputs_dict[input_name] = torch.zeros( + *shape, model.config.input_feat_per_channel, dtype=torch.float, device=device + ) + elif "visual_feats" in input_name: + inputs_dict[input_name] = torch.zeros( + shape + + [ + model.config.visual_feat_dim, + ], + dtype=torch.float, + device=device, + ) + elif "visual_pos" in input_name: + inputs_dict[input_name] = torch.zeros( + shape + + [ + model.config.visual_pos_dim, + ], + dtype=torch.float, + device=device, + ) + elif "inputs" in input_name: + inputs_dict[input_name] = torch.zeros(*shape, dtype=torch.float, device=device) + elif "input_values" in input_name: + batch_size, _ = shape + # Generating big sequence length for audio inputs. + seq_length = _generate_random_int(low=10000, high=20000) + inputs_dict[input_name] = torch.zeros(batch_size, seq_length, dtype=torch.float, device=device) + elif "mask" in input_name: + if "past_key_values" in input_names: + mask_shape = [shape[0], shape[1] + kv_cache_length] + else: + mask_shape = shape + + inputs_dict[input_name] = torch.zeros(mask_shape, dtype=torch.long, device=device) + elif "ids" in input_name: + inputs_dict[input_name] = torch.zeros(shape, dtype=torch.long, device=device) + elif "past_key_values" in input_name: + if model.config.model_type not in _FX_SUPPORTED_MODELS_WITH_KV_CACHE: + raise NotImplementedError( + f"Symbolic trace with past_key_values input is not supported yet for the model {model.config.model_type}. Please open an issue or a PR in Transformers repository if you would like to see the support added." 
+                )
+            num_heads = model.config.num_attention_heads
+            head_dim = model.config.hidden_size // model.config.num_attention_heads
+
+            cache_shape = (shape[0], num_heads, kv_cache_length, head_dim)
+            pkv = tuple(
+                (
+                    torch.rand(cache_shape, dtype=torch.float, device=device),
+                    torch.rand(cache_shape, dtype=torch.float, device=device),
+                )
+                for _ in range(model.config.num_hidden_layers)
+            )
+            inputs_dict[input_name] = pkv
+        else:
+            shape_with_hidden_size = shape + [model.config.hidden_size]
+            inputs_dict[input_name] = torch.zeros(shape_with_hidden_size, dtype=torch.float, device=device)
+
+        return inputs_dict
+
+    def is_leaf_module(self, m, module_qualified_name):
+        return (not self._stateless_mod_instanciation_depends_on_proxies(m)) and (
+            hasattr(m, _IS_PARTITION_MODULE)
+            or (
+                (m.__module__.startswith("torch.nn") or m.__module__.startswith("torch.ao.nn"))
+                and not isinstance(m, torch.nn.Sequential)
+            )
+        )
+
+    def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, proxy_factory_fn=None):
+        rv = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
+
+        if kind == "placeholder" and target in self.meta_args:
+            rv.install_metadata(self.meta_args[target])
+            return rv
+
+        if target in self.orig_fns:
+            # NOTE: tensor constructors in PyTorch define the `device` argument as
+            # *kwargs-only*. That is why this works. If you add methods to
+            # _TORCH_METHODS_TO_PATCH that do not define `device` as kwarg-only,
+            # this will break and you will likely see issues where we cannot infer
+            # the size of the output.
+            if "device" in kwargs:
+                kwargs["device"] = "meta"
+
+        try:
+            args_metas = torch.fx.node.map_aggregate(args, _proxies_to_metas)
+            kwargs_metas = torch.fx.node.map_aggregate(kwargs, _proxies_to_metas)
+
+            if kind == "call_function":
+                meta_target = _MANUAL_META_OVERRIDES.get(target, target)
+                meta_out = meta_target(*args_metas, **kwargs_metas)
+                if isinstance(meta_out, torch.Tensor):
+                    meta_out = meta_out.to(device="meta")
+            elif kind == "call_method":
+                method = getattr(args_metas[0].__class__, target)
+                meta_target = _MANUAL_META_OVERRIDES.get(method, method)
+                meta_out = meta_target(*args_metas, **kwargs_metas)
+            elif kind == "call_module":
+                if not hasattr(self, "orig_forward"):
+                    raise AttributeError(f"{self} does not have an attribute called orig_forward")
+                self._disable_module_getattr = True
+                try:
+                    mod = self.root.get_submodule(target)
+
+                    mod_type = type(mod)
+                    assert not any(path for path in self.visited_partition_module_paths if target.startswith(path))
+                    self.visited_partition_module_paths.add(target)
+                    if mod_type in _MANUAL_META_OVERRIDES:
+                        meta_out = _MANUAL_META_OVERRIDES[mod_type](mod, *args_metas, **kwargs_metas)
+                    else:
+                        if self.partition_module_classes_and_fqns and (
+                            target in self.partition_module_classes_and_fqns
+                            or mod_type in self.partition_module_classes_and_fqns
+                        ):
+                            raise ValueError  # do not recurse into a partition module's forward()
+                        meta_out = self.orig_forward(*args_metas, **kwargs_metas)
+                except:  # noqa: E722
+                    mod = self.root.get_submodule(target)
+                    mod_type = type(mod)
+                    meta_out = None
+                finally:
+                    self._disable_module_getattr = False
+            elif kind == "get_attr":
+                self._disable_module_getattr = True
+                try:
+                    attr_itr = self.root
+                    atoms = target.split(".")
+                    for atom in atoms:
+                        attr_itr = getattr(attr_itr, atom)
+                    if isinstance(attr_itr, torch.Tensor):
+                        meta_out = attr_itr.to(device="meta")
+                    else:
+                        meta_out = attr_itr
+                finally:
+                    self._disable_module_getattr = False
+            else:
+                return rv
+
+            if not isinstance(rv, Proxy):
+                raise ValueError("Don't support composite output yet")
+            rv.install_metadata(meta_out)
+        except Exception as e:
+            if _IS_IN_DEBUG_MODE:
+                warnings.warn(f"Could not compute metadata for {kind} target {target}: {e}")
+
+        return rv
+
+    # Replaced by .getattr from PyTorch 1.13
+    def _module_getattr(self, attr, attr_val, parameter_proxy_cache):
+        if getattr(self, "_disable_module_getattr", False):
+            return attr_val
+        else:
+
+            def maybe_get_proxy_for_attr(attr_val, collection_to_search, parameter_proxy_cache):
+                for n, p in collection_to_search:
+                    if attr_val is p:
+                        if n not in parameter_proxy_cache:
+                            kwargs = {}
+                            if "proxy_factory_fn" in inspect.signature(self.create_proxy).parameters:
+                                kwargs["proxy_factory_fn"] = (
+                                    None
+                                    if not self.param_shapes_constant
+                                    else lambda node: ParameterProxy(self, node, n, attr_val)
+                                )
+                            val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs)  # type: ignore[arg-type]
+                            parameter_proxy_cache[n] = val_proxy
+                        return parameter_proxy_cache[n]
+                return None
+
+            if isinstance(attr_val, torch.nn.Parameter):
+                maybe_parameter_proxy = maybe_get_proxy_for_attr(
+                    attr_val, self.root.named_parameters(), parameter_proxy_cache
+                )
+                if maybe_parameter_proxy is not None:
+                    return maybe_parameter_proxy
+
+            if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
+                maybe_buffer_proxy = maybe_get_proxy_for_attr(
+                    attr_val, self.root.named_buffers(), parameter_proxy_cache
+                )
+                if maybe_buffer_proxy is not None:
+                    return maybe_buffer_proxy
+
+            return attr_val
+
+    # Needed for PyTorch 1.13+
+    def getattr(self, attr: str, attr_val: Any, parameter_proxy_cache: Dict[str, Any]):
+        return self._module_getattr(attr, attr_val, parameter_proxy_cache)
+
+    def call_module(self, m, forward, args, kwargs):
+        self.orig_forward = forward
+        return super().call_module(m, forward, args, kwargs)
+
+    def proxy(self, node):
+        return HFProxy(node, self)
+
+    def trace(
+        self,
+        root: Union[torch.nn.Module, Callable[..., Any]],
+        concrete_args: Optional[Dict[str, Any]] = None,
+        dummy_inputs: Optional[Dict[str, Any]] = None,
+        complete_concrete_args_with_inputs_not_in_dummy_inputs: bool = True,
+    ) -> Graph:
+        """
+        Traces `root` and returns the corresponding FX `torch.fx.Graph` representation. `root` can either be a
+        `torch.nn.Module` instance or a Python callable. Note that after this call, `self.root` may be different from
+        the `root` passed in here. For example, when a free function is passed to `trace()`, we will create a
+        `torch.nn.Module` instance to use as the root and add embedded constants to it.
+
+        Args:
+            root (`torch.nn.Module` or `Callable`):
+                Either a `torch.nn.Module` or a function to be traced through. If root is not a
+                [`~transformers.PreTrainedModel`], then `dummy_inputs` must be passed, otherwise tracing will fail.
+            concrete_args (`Dict[str, Any]`, *optional*):
+                Concrete arguments that should not be treated as Proxies.
+            dummy_inputs (`Dict[str, Any]`, *optional*):
+                The dummy inputs needed to handle data-dependent control-flow if `root` is not a
+                [`~transformers.PreTrainedModel`]. It can also be used when `root` is a
+                [`~transformers.PreTrainedModel`] to specify custom dummy inputs for a subset or all the model inputs.
+            complete_concrete_args_with_inputs_not_in_dummy_inputs (`bool`, *optional*, defaults to `True`):
+                If `True`, and `dummy_inputs` is specified, every argument that `root` can take that is not in
+                `dummy_inputs` and not in `concrete_args` will be added to `concrete_args`, otherwise does nothing.
+
+        Returns:
+            `torch.fx.Graph`:
+                A FX `torch.fx.Graph` representing the semantics of the passed-in `root`.
+
+        """
+        sig = inspect.signature(root.forward if isinstance(root, torch.nn.Module) else root)
+
+        if concrete_args is None:
+            concrete_args = {}
+
+        if dummy_inputs is not None and complete_concrete_args_with_inputs_not_in_dummy_inputs:
+            for param in sig.parameters.values():
+                if param.name in dummy_inputs:
+                    continue
+                if param.default is inspect.Parameter.empty:
+                    raise ValueError(f"You need to specify a default value for the parameter {param.name}.")
+            concrete_args.update(
+                {
+                    p.name: p.default
+                    for p in sig.parameters.values()
+                    if (p.name not in dummy_inputs and p.name not in concrete_args)
+                }
+            )
+
+        input_names = sig.parameters.keys() - concrete_args.keys()
+
+        # Creating a random input shape to generate dummy inputs.
+        batch_size = _generate_random_int()
+        sequence_length = _generate_random_int()
+        shape = [batch_size, sequence_length]
+
+        if root.__class__.__name__ in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES):
+            num_choices = _generate_random_int(low=2, high=5)
+            shape.insert(1, num_choices)
+
+        inputs = dict(dummy_inputs) if dummy_inputs is not None else {}
+        for input_name in input_names:
+            if input_name in inputs:
+                continue
+            # We enforce that root must either be a PreTrainedModel or deserialized from a serialized traced model to
+            # be able to use HFTracer._generate_dummy_input.
+            if isinstance(root, self.supported_archs) or type(root).__qualname__.startswith(
+                ("_deserialize_graph_module", "_CodeOnlyModule")
+            ):
+                inputs.update(self._generate_dummy_input(root, input_name, shape, input_names=input_names))
+            else:
+                raise RuntimeError(
+                    f"Could not generate input named {input_name} because root is not a"
+                    " transformers.PreTrainedModel."
+                )
+
+        concrete_metas = {
+            input_name: input_.to("meta") if isinstance(input_, torch.Tensor) else input_
+            for input_name, input_ in inputs.items()
+        }
+        for param in sig.parameters.values():
+            if param.kind == inspect.Parameter.VAR_KEYWORD and param.name not in input_names:
+                concrete_metas[f"**{param.name}"] = {}
+        self.meta_args = concrete_metas
+        self.patched_torch_methods = {
+            target: _gen_constructor_wrapper(getattr(torch, target)) for target in self._TORCH_METHODS_TO_PATCH
+        }
+        self.orig_fns = set()
+
+        for name, (wrapper, orig) in self.patched_torch_methods.items():
+            setattr(torch, name, wrapper)
+            self.orig_fns.add(orig)
+
+        try:
+            self.graph = super().trace(root, concrete_args=concrete_args)
+        finally:
+            for name, (_, orig) in self.patched_torch_methods.items():
+                setattr(torch, name, orig)
+
+        # This is necessary because concrete args are added as input to the traced module since
+        # https://github.com/pytorch/pytorch/pull/55888.
+        for node in self.graph.nodes:
+            if node.op == "placeholder":
+                # Removing default values for inputs as the forward pass will fail with them.
+                if node.target in input_names:
+                    node.args = ()
+                    # Without this, torch.jit.script fails because the inputs type is Optional[torch.Tensor].
+                    # It cannot infer on the attributes and methods the input should have, and fails.
+                    node.type = torch.Tensor
+                # It is a concrete arg so it is not used and should be removed.
+                else:
+                    to_visit = [node]
+                    to_delete = collections.OrderedDict()
+                    while to_visit:
+                        n = to_visit.pop(0)
+                        to_delete[n] = None
+                        to_visit += list(n.users.keys())
+
+                    for user in reversed(to_delete.keys()):
+                        self.graph.erase_node(user)
+
+            # Without this, return type annotation "Tuple" is causing code execution failure.
+            if node.op == "output":
+                node.type = None
+
+        return self.graph
+
+    def _stateless_mod_instanciation_depends_on_proxies(self, mod: nn.Module) -> bool:
+        """
+        Whether the module was instantiated with Proxies. If that is the case, such a module cannot be a leaf module
+        because its attributes are input-dependent.
+        """
+        return any(isinstance(attr, Proxy) for attr in mod.__dict__.values())
+
+    def _insert_module_as_submodule(self, mod: nn.Module) -> str:
+        """
+        Helper method which tries to insert a module that was not declared as a submodule.
+        """
+        # If one of the module attributes is a Proxy, it means that its instantiation is input-dependent.
+        # It is not possible to insert such modules, those should be traced through.
+        if self._stateless_mod_instanciation_depends_on_proxies(mod):
+            return ""
+        idx = 0
+        mod_name = mod.__class__.__name__.lower()
+        path = f"{mod_name}_{idx}"
+        already_inserted = False
+        while hasattr(self.root, path):
+            if getattr(self.root, path) is mod:
+                already_inserted = True
+                break
+            # advance to the next candidate name before re-checking
+            idx += 1
+            path = f"{mod_name}_{idx}"
+
+        # No need to add multiple instances of the same module.
+        if not already_inserted:
+            self.root.add_module(path, mod)
+        return path
+
+    def path_of_module(self, mod: nn.Module) -> str:
+        """
+        Helper method to find the qualified name of `mod` in the Module hierarchy of `root`. For example, if `root` has
+        a submodule named `foo`, which has a submodule named `bar`, passing `bar` into this function will return the
+        string "foo.bar".
+
+        Args:
+            mod (nn.Module): The `Module` to retrieve the qualified name for.
+        """
+        try:
+            return super().path_of_module(mod)
+        except NameError as e:
+            if self.allow_insert_stateless_mods and len(list(mod.parameters())) == 0 and len(list(mod.buffers())) == 0:
+                path = self._insert_module_as_submodule(mod)
+                return path
+            raise e
+
+    @compatibility(is_backward_compatible=True)
+    def keys(self, obj: "Proxy") -> Any:
+        """Called when the keys() method is called on a proxy object.
+        This is what happens when ** is used on a proxy. This should return an iterator if ** is supposed to work in
+        your custom tracer.
+        """
+        attribute = HFAttribute(obj, "keys")()
+        if obj.node.target == "**kwargs":
+            return attribute._metadata
+        return attribute
+
+
+def get_concrete_args(model: nn.Module, input_names: List[str]):
+    sig = inspect.signature(model.forward)
+
+    if not (set(input_names) <= set(sig.parameters.keys())):
+        formatted_input_names = input_names[0] if len(input_names) == 1 else ", ".join(input_names)
+        formatted_allowed_input_names = ", ".join(sig.parameters.keys())
+        raise ValueError(
+            f"The model does not have input(s) named: {formatted_input_names}, expected a subset of the following:"
+            f" {formatted_allowed_input_names}"
+        )
+
+    return {p.name: p.default for p in sig.parameters.values() if p.name not in input_names}
+
+
+def hf_symbolic_trace(
+    model: PreTrainedModel,
+    input_names: Optional[List[str]] = None,
+    disable_check: bool = False,
+    tracer_cls: Type[HFTracer] = HFTracer,
+    partition_modules: Optional[List] = None,
+) -> GraphModule:
+    """
+    Performs symbolic tracing on the model.
+
+    Args:
+        model ([`PreTrainedModel`]):
+            The model to trace.
+        input_names (`List[str]`, *optional*):
+            The names of the inputs of the traced model. If unset, model.dummy_inputs.keys() are used instead.
+        disable_check (`bool`, *optional*, defaults to `False`):
+            If `True`, no check is done before trying to trace the model, this is mostly useful for debugging purposes.
+        tracer_cls (`Type[HFTracer]`, *optional*, defaults to `HFTracer`):
+            The tracer class to use for instantiating the tracer. If unset, `HFTracer` is used instead.
+        partition_modules (`List`, *optional*):
+            A list of fully qualified paths to un-partitionable submodules, or module classes to treat
+            as un-partitionable.
+
+    Returns:
+        `torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model.
+
+    Example:
+
+    ```python
+    traced_model = hf_symbolic_trace(model, input_names=["input_ids", "attention_mask", "token_type_ids"])
+    ```
+    """
+    if input_names is None:
+        input_names = model.dummy_inputs.keys()
+
+    input_names = list(input_names)
+    concrete_args = get_concrete_args(model, input_names)
+
+    if not disable_check:
+        check_if_model_is_supported(model)
+
+    # Tracing.
+    if partition_modules:
+        # annotate partition modules as minimally unpartitionable units in the stage split.
+        assert isinstance(partition_modules, Sequence)
+
+        def _check_legitimate_fqn(unique_paths, path):
+            return not any(path == p or path.startswith(p + ".") for p in unique_paths)
+
+        partition_modules_paths = set()
+        for fqn, sub_module in model.named_modules():
+            if (fqn in partition_modules) or (
+                type(sub_module) in partition_modules and _check_legitimate_fqn(partition_modules_paths, fqn)
+            ):
+                partition_modules_paths.add(fqn)
+        partition_modules_paths = list(partition_modules_paths)
+        register_partition_module(model, fully_qualified_names=partition_modules_paths)
+    tracer = tracer_cls(partition_modules=partition_modules)
+    traced_graph = tracer.trace(model, concrete_args=concrete_args)
+    traced = torch.fx.GraphModule(model, traced_graph)
+
+    if hasattr(model, "config"):
+        traced.config = model.config
+    # The model class must be stored as an attribute to allow model deserialization, which uses trace, and thus
+    # _generate_dummy_input, where the model class is needed.
+    traced.class_for_deserialization = model.__class__
+    if hasattr(model, "device"):
+        traced.device = model.device
+
+    return traced
+
+
+def register_partition_module(module: nn.Module, fully_qualified_names: Union[str, Sequence] = None):
+    if fully_qualified_names is None:
+        setattr(module, _IS_PARTITION_MODULE, True)
+    else:
+        if isinstance(fully_qualified_names, str):
+            fully_qualified_names = [fully_qualified_names]
+        for fqn, sub_module in module.named_modules():
+            for mod_name in fully_qualified_names:
+                if fqn == mod_name:
+                    setattr(sub_module, _IS_PARTITION_MODULE, True)
diff --git a/vescale/plan/__init__.py b/vescale/plan/__init__.py
new file mode 100644
index 0000000..3d6e4cf
--- /dev/null
+++ b/vescale/plan/__init__.py
@@ -0,0 +1,20 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+
+from .pipeline_parallel import PipelineParallelPlan
+from .spec import *
diff --git a/vescale/plan/pipeline_parallel.py b/vescale/plan/pipeline_parallel.py
new file mode 100644
index 0000000..b39fdb5
--- /dev/null
+++ b/vescale/plan/pipeline_parallel.py
@@ -0,0 +1,142 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+
+from dataclasses import dataclass, field
+from typing import List, Dict
+from .spec import *  # noqa: F403
+import torch
+
+__all__ = ["PipelineParallelPlan"]
+
+
+@dataclass
+class PipelineParallelPlan:
+    # pipeline parallel (PP) mode
+    mode: ModeType = ModeType.GRAPH_EAGER
+
+    ########## model graph and partition ##########
+
+    # type of tracer used to obtain the model execution graph
+    # fit modes: [GRAPH_EAGER]
+    # format: Enum
+    # consumer: PipeParser
+    tracer_type: TracerType = TracerType.AUTO
+
+    # kwargs to be fed to different parsers, e.g. torch.fx, dynamo, export, etc.
+    # fit modes: [GRAPH_EAGER]
+    # format: Dict
+    # consumer: PipeParser
+    tracer_kwargs: Dict = None
+
+    # method of stage partitioning for all modes
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: Enum
+    # consumer: PipeParser and ManualPipeParser
+    split_method: PipelineSplitMethodType = PipelineSplitMethodType.MANUAL
+
+    # number of pipeline stages
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: int
+    # consumer: PipeParser
+    num_stages: int = 2
+
+    # number of virtual module chunks per pipeline stage
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: int
+    # consumer: ScheduleEngine, PipeModule
+    virtual_chunks: int = 1
+
+    # list of minimum un-partitionable units in the model forward graph. Internal hierarchy
+    # of a partition unit is maintained during stage splitting
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: list of fqns to particular modules/callables, or module classes
+    # consumer: ScheduleEngine, PipeModule
+    smallest_unsplittable_units: List = field(default_factory=list)
+
+    # stage boundaries
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: a list of fqns or index integers of particular modules / callables
+    # consumer: PipeParser and ManualParser
+    split_points: List = field(default_factory=list)
+
+    # enables manually defining the boundaries of virtual stage chunks in the interleaved 1F1B schedule
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: boolean
+    # consumer: PipeParser and ManualParser
+    enable_vpp_split_points: bool = False
+
+    # enables uniformly splitting stages by modules and operators when split_method==PipelineSplitMethodType.UNIFORM
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: boolean
+    # consumer: PipeParser and ManualParser
+    uniform_split_ops: bool = False
+
+    ########## end of model graph generation, partition ##########
+
+    ########## pipeline runtime ##########
+
+    # executes batched p2p communication for simple 1f1b and interleaved 1f1b;
+    # mutually exclusive with overlap_p2p_comm
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: bool
+    # consumer: ScheduleEngine
+    batch_p2p_comm: bool = False
+
+    # executes overlapped p2p communication for simple 1f1b and interleaved 1f1b;
+    # mutually exclusive with batch_p2p_comm
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: bool
+    # consumer: ScheduleEngine
+    overlap_p2p_comm: bool = True
+
+    # set to True for inference, so that the pipeline schedule only executes forward propagation
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: bool
+    # consumer: ScheduleEngine
+    forward_only: bool = False
+
+    # pipeline schedule type
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: Enum
+    # consumer: ScheduleEngine
+    schedule_type: PipelineScheduleType = PipelineScheduleType.SIMPLE_1F1B
+
+    # reuses data tensor shapes in some use cases instead of communicating
+    # shapes before tensors. Use with caution!
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: bool
+    # consumer: ScheduleEngine
+    reuse_p2p_tensor_shape: bool = False
+
+    # precision type of communicated tensors during pipeline execution
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: torch.dtype
+    # consumer: ScheduleEngine
+    p2p_tensor_dtype: torch.dtype = torch.float32
+
+    ########## end of pipeline schedule ##########
+
+    ########## other information ##########
+
+    # list of groups of fqns whose parameters or gradients will be synchronized per step, e.g. embedding modules
+    # fit modes: [EAGER, GRAPH_EAGER]
+    # format: [ [word_embeddingA, word_embeddingB], [vision_embeddingA, vision_embeddingB] ]
+    # consumer: build utilities in vescale/api.py
+    shared_modules: List[List[str]] = field(default_factory=list)
+
+    ########## end of other information ##########
diff --git a/vescale/plan/spec.py b/vescale/plan/spec.py
new file mode 100644
index 0000000..e91a39e
--- /dev/null
+++ b/vescale/plan/spec.py
@@ -0,0 +1,78 @@
+################################################################################
+#
+# Copyright 2023 ByteDance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from enum import Enum, auto
+from dataclasses import dataclass
+from typing import TypeVar
+
+
+__all__ = [
+    "ModeType",
+    "PipelineP2PSpec",
+    "PipelineSplitMethodType",
+    "PipelineScheduleType",
+    "TracerType",
+]
+
+ArrayLike = TypeVar("ArrayLike")
+
+
+class ModeType(Enum):
+    """Type of parallel modes"""
+
+    EAGER = auto()
+    MANUAL_EAGER = auto()
+    GRAPH_EAGER = auto()
+
+
+class PipelineSplitMethodType(Enum):
+    """Type of pipeline stage partitioning methods"""

+    MANUAL = auto()
+    UNIFORM = auto()
+    PARAMETERS = auto()
+    AUTO = auto()
+    SIMULATOR = auto()
+    FLOPS = auto()
+
+
+class PipelineScheduleType(Enum):
+    """Type of pipeline parallel schedules"""
+
+    SIMPLE_1F1B = auto()
+    INTERLEAVED_1F1B = auto()
+    GPIPE = auto()
+    ZERO_BUBBLE = auto()
+    GRAPH_PIPE = auto()
+
+
+class TracerType(Enum):
+    """Type of tracers used to capture the model execution graph"""
+
+    VESCALE_FX = auto()
+    VESCALE_EXPORT = auto()
+    HF_FX = auto()
+    TORCH_FX = auto()
+    TORCH_DYNAMO = auto()
+    TORCH_EXPORT = auto()
+    AUTO = auto()
+
+
+@dataclass
+class PipelineP2PSpec:
+    """The P2P spec describing a point-to-point communication edge in a manual pipeline plan."""
+
+    peer_stage_idx: int
+    peer_output_idx: int = 0
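+
+
+# A minimal usage sketch (illustrative only; the mapping below is an assumption for
+# demonstration, not produced by this module): stage 1 consumes stage 0's first
+# output as its first input, while its second input comes from stage 1's local data.
+#
+#   p2p_index_mapping = {
+#       1: [
+#           PipelineP2PSpec(peer_stage_idx=0, peer_output_idx=0),
+#           PipelineP2PSpec(peer_stage_idx=1, peer_output_idx=1),
+#       ],
+#   }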