From 89bd96baabac6507666964c7b9dcf2588da75fdf Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Sat, 8 Jun 2024 02:47:25 +0300 Subject: [PATCH] Get rid of some aliasing violations --- blazingio.hpp | 80 +++++++++++++++++++++-------------------------- blazingio.min.hpp | 4 +-- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/blazingio.hpp b/blazingio.hpp index 596062e..891a54c 100644 --- a/blazingio.hpp +++ b/blazingio.hpp @@ -532,6 +532,7 @@ struct istream_impl { // We expect long runs here, hence vectorization. Instrinsics break aliasing, and if we // interleave ptr modification with SIMD loading, there's going to be an extra memory // write on every iteration. + NonAliasingChar* p = ptr; @match @case linux-*,macos-* @case windows-* @@ -553,45 +554,37 @@ struct istream_impl { @case *-x86+avx2 int mask; SIMD_TYPE space = _mm256_set1_epi8(' '); - SIMD_TYPE *p = (SIMD_TYPE*)ptr; while ( !(mask = _mm256_movemask_epi8( - _mm256_cmpeq_epi8(space, _mm256_max_epu8(space, _mm256_loadu_si256(p))) + _mm256_cmpeq_epi8(space, _mm256_max_epu8(space, _mm256_loadu_si256((SIMD_TYPE*)p))) )) ) - // XXX: I have no idea if this pointer arithmetic is sound. __m256i is may_alias, so - // it likely is, but there aren't any explicit guarantees. - p++; - ptr = (NonAliasingChar*)p + BSFD(mask); + p += 32; + ptr = p + BSFD(mask); @case *-x86+sse4.1 int mask; SIMD_TYPE space = _mm_set1_epi8(' '); - SIMD_TYPE *p = (SIMD_TYPE*)ptr; while ( !(mask = _mm_movemask_epi8( - _mm_cmpeq_epi8(space, _mm_max_epu8(space, _mm_loadu_si128(p))) + _mm_cmpeq_epi8(space, _mm_max_epu8(space, _mm_loadu_si128((SIMD_TYPE*)p))) )) ) - // XXX: I have no idea if this pointer arithmetic is sound. __m128i is may_alias, so - // it likely is, but there aren't any explicit guarantees. - p++; - ptr = (NonAliasingChar*)p + BSFD(mask); + p += 16; + ptr = p + BSFD(mask); @case *-aarch64+neon uint64x2_t vec; - auto p = (uint8_t*)ptr; - while (vec = (uint64x2_t)(vld1q_u8(p) < 33), !(vec[0] | vec[1])) + while (vec = (uint64x2_t)(vld1q_u8((uint8_t*)p) < 33), !(vec[0] | vec[1])) p += 16; - ptr = (NonAliasingChar*)p + (vec[0] ? 0 : 8) + BSFQ_64BIT(vec[0] ?: vec[1]) / 8; + ptr = p + (vec[0] ? 0 : 8) + BSFQ_64BIT(vec[0] ?: vec[1]) / 8; @case *-x86+none,*-aarch64+none // This is a variation on Mycroft's algorithm. See // https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ for // the original code. - // XXX: there's a strict aliasing violation here - SIMD_TYPE* p = (SIMD_TYPE*)ptr; uint64_t vec; - while (!(vec = ((*p - ONE_BYTES * 33) & ~*p & (ONE_BYTES << 7)))) - p++; - ptr = (NonAliasingChar*)p + BSFQ(vec) / 8; + // XXX: there's a strict aliasing violation here + while (!(vec = ((*(SIMD_TYPE*)p - ONE_BYTES * 33) & ~*(SIMD_TYPE*)p & (ONE_BYTES << 7)))) + p += 8; + ptr = p + BSFQ(vec) / 8; @end }); } @@ -651,7 +644,7 @@ struct istream_impl { while (i % SIMD_SIZE) value[--i] = *ptr++ == '1'; !endif - SIMD_TYPE* p = (SIMD_TYPE*)ptr; + NonAliasingChar* p = ptr; !ifdef INTERACTIVE for (int64_t j = 0; j < min(i, end - ptr) / SIMD_SIZE; j++) { !else @@ -674,10 +667,7 @@ struct istream_impl { PUT(uint32_t, BSWAP32( _mm256_movemask_epi8( _mm256_shuffle_epi8( - // XXX: I have no idea if this pointer arithmetic is sound. __m256i - // is may_alias, so it likely is, but there aren't any explicit - // guarantees. - _mm256_slli_epi32(_mm256_loadu_si256(p++), 7), + _mm256_slli_epi32(_mm256_loadu_si256((SIMD_TYPE*)p), 7), _mm256_set_epi64x( a + ONE_BYTES * 24, a + ONE_BYTES * 16, @@ -692,28 +682,26 @@ struct istream_impl { uint64_t a = ~0ULL / 65025; PUT(uint16_t, _mm_movemask_epi8( _mm_shuffle_epi8( - // XXX: I have no idea if this pointer arithmetic is sound. __m128i is - // may_alias, so it likely is, but there aren't any explicit guarantees. - _mm_slli_epi32(_mm_loadu_si128(p++), 7), + _mm_slli_epi32(_mm_loadu_si128((SIMD_TYPE*)p), 7), _mm_set_epi64x(a, a + ONE_BYTES * 8) ) )); @case *-aarch64+neon - // XXX: there's a strict aliasing violation here - auto masked = (uint8x16_t)vdupq_n_u64(POWERS_OF_TWO) & ('0' - vld1q_u8((uint8_t*)p++)); + auto masked = (uint8x16_t)vdupq_n_u64(POWERS_OF_TWO) & ('0' - vld1q_u8((uint8_t*)p)); auto zipped = vzip_u8(vget_high_u8(masked), vget_low_u8(masked)); PUT(uint16_t, vaddvq_u16( (uint16x8_t)vcombine_u8(zipped.val[0], zipped.val[1]) )); @case *-x86+none,*-aarch64+none // XXX: there's a strict aliasing violation here - PUT(char, (*p++ & ONE_BYTES) * BITSET_SHIFT >> 56); + PUT(char, (*(uint64_t*)p & ONE_BYTES) * BITSET_SHIFT >> 56); @end + p += SIMD_SIZE; @ondemand *-x86+avx2,*-x86+sse4.1,*-aarch64+neon memcpy((char*)&value + i / 8, &x, sizeof(x)); @end } - ptr = (NonAliasingChar*)p; + ptr = p; !ifdef INTERACTIVE } !endif @@ -1274,21 +1262,20 @@ struct SPLIT_HERE blazingio_ostream { auto i = N; while (i % SIMD_SIZE) *ptr++ = '0' + value[--i]; - SIMD_TYPE* p = (SIMD_TYPE*)ptr; + NonAliasingChar* p = ptr; i /= SIMD_SIZE; while (i) { @match @case *-x86+avx2 auto b = _mm256_set1_epi64x(POWERS_OF_TWO); _mm256_storeu_si256( - // XXX: I have no idea if this pointer arithmetic is sound. __m256i is may_alias, so - // it likely is, but there aren't any explicit guarantees. - p++, + (SIMD_TYPE*)p, _mm256_sub_epi8( _mm256_set1_epi8('0'), _mm256_cmpeq_epi8( _mm256_and_si256( _mm256_shuffle_epi8( + // XXX: there's a strict aliasing violation here _mm256_set1_epi32(((uint32_t*)&value)[--i]), _mm256_set_epi64x(0, ONE_BYTES, ONE_BYTES * 2, ONE_BYTES * 3) ), @@ -1301,14 +1288,13 @@ struct SPLIT_HERE blazingio_ostream { @case *-x86+sse4.1 auto b = _mm_set1_epi64x(POWERS_OF_TWO); _mm_storeu_si128( - // XXX: I have no idea if this pointer arithmetic is sound. __m128i is may_alias, so - // it likely is, but there aren't any explicit guarantees. - p++, + (SIMD_TYPE*)p, _mm_sub_epi8( _mm_set1_epi8('0'), _mm_cmpeq_epi8( _mm_and_si128( _mm_shuffle_epi8( + // XXX: there's a strict aliasing violation here _mm_set1_epi16(((uint16_t*)&value)[--i]), _mm_set_epi64x(0, ONE_BYTES) ), @@ -1319,18 +1305,22 @@ struct SPLIT_HERE blazingio_ostream { ) ); @case *-aarch64+neon + // XXX: there's a strict aliasing violation here auto vec = (uint8x8_t)vdup_n_u16(((uint16_t*)&value)[--i]); - // XXX: there's an aliasing violation here - *p++ = '0' - vtstq_u8( - vcombine_u8(vuzp2_u8(vec, vec), vuzp1_u8(vec, vec)), - (uint8x16_t)vdupq_n_u64(POWERS_OF_TWO) + vst1q_u8( + (uint8_t*)p, + '0' - vtstq_u8( + vcombine_u8(vuzp2_u8(vec, vec), vuzp1_u8(vec, vec)), + (uint8x16_t)vdupq_n_u64(POWERS_OF_TWO) + ) ); @case *-x86+none,*-aarch64+none // XXX: there's an aliasing violation here - *p++ = ((BITSET_SHIFT * (((uint8_t*)&value)[--i]) >> 7) & ONE_BYTES) | (ONE_BYTES * 0x30); + *(uint64_t*)p = ((BITSET_SHIFT * (((uint8_t*)&value)[--i]) >> 7) & ONE_BYTES) | (ONE_BYTES * 0x30); @end + p += SIMD_SIZE; } - ptr = (NonAliasingChar*)p; + ptr = p; } !endif diff --git a/blazingio.min.hpp b/blazingio.min.hpp index 0c9588d..32b7647 100644 --- a/blazingio.min.hpp +++ b/blazingio.min.hpp @@ -55,8 +55,8 @@ #define $I $w(__forceinline,__attribute__((always_inline))) #define $F M(), #define E$(x)if(!(x))abort(); -$w(LONG WINAPI $x(_EXCEPTION_POINTERS*);,)namespace $f{using namespace std;struct B{enum $c A:char{}c;B&$O=(char x){c=A{x};$r}$O char(){$R(char)c;}};$C uint64_t C=~0ULL/255;struct D{string&K;};static B E[65568];templatestruct G{B*H,*I;void K(off_t C){$w(char*D=(char*)VirtualAlloc(0,(C+8191)&-4096,8192,1);E$(D)E$(VirtualFree(D,0,32768))DWORD A=C&-65536;E$(!A||MapViewOfFileEx(CreateFileMapping(GetStdHandle(-10),0,2,0,A,0),4,0,0,0,D)==D)E$(VirtualAlloc(D+A,65536,12288,4)==D+A)E$(~_lseek(0,A,0))DWORD E=0;ReadFile(GetStdHandle(-10),D+A,65536,&E,0);,int A=getpagesize();char*D=(char*)mmap(0,C+A,3,2,0,0);E$(D!=(void*)-1)E$(mmap(D+((C+A-1)&-A),A,3,$m(4114,50),-1,0)!=(void*)-1))H=(B*)D+C;*H=10;H[1]=48;H[2]=0;I=(B*)D;}void L(){H=I=E;}$I void M(){if(F&&I==H){$w(DWORD A=0;ReadFile(GetStdHandle(-10),I=E,65536,&A,0);,$a($u(register long A asm("x0")=0,D asm("x1")=(long)E,G asm("x2")=65536,C asm($m("x16","x8"))=$m(3,63);asm volatile("svc 0" $m("x80",):"+r"(A),"+r"(D):"r"(C),"r"(G));I=launder(E);),off_t A=$H(3,$m(33554435,0));B*D=E;asm volatile($H("int $128","syscall"):"+a"(A),$H("+c"(D):"b","+S"(D):"D")(0),"d"(65536)$H(,$u(:"rcx","r11")));I=D;))H=I+A;*H=10;if(!A)E[1]=48,E[2]=0;}}$T>$I void N(T&x){while($F(*I&240)==48)x=x*10+(*I++-48);}$T>$I decltype((void)~T{1})O(T&x){M();int A=is_signed_v&&*I==45;I+=A;N(x=0);x=A?1+~x:x;}$T>$I decltype((void)T{1.})O(T&x){M();int A=*I==45;I+=A;$F I+=*I==43;uint64_t n=0;int i=0;for(;i<18&&($F*I&240)==48;i++)n=n*10+*I++-48;int B=20;int C=*I==46;I+=C;for(;i<18&&($F*I&240)==48;i++)n=n*10+*I++-48,B-=C;x=(T)n;while(($F*I&240)==48)x=x*10+*I++-48,B-=C;if(*I==46)I++,C=1;while(($F*I&240)==48)x=x*10+*I++-48,B-=C;int D;if((*I|32)==101)I++,$F I+=*I==43,O(D),B+=D;static $C auto E=[](){arrayE{};T x=1;for(int i=21;i--;)E[40-i]=x,E[i]=1/x,x*=10;$R E;}();while(B>40)x*=(T)1e10,B-=10;while(B<0)x*=(T)1e-10,B+=10;x*=E[B];x=A?-x:x;}$I void O(bool&x){$F x=*I++==49;}$I void O(char&x){$F x=*I++;}$I void O(uint8_t&x){$F x=*I++;}$I void O(int8_t&x){$F x=*I++;}$T>$s void P(string&K,T C){M();B*G=I;C();K.assign((char*)G,I-G);while(F&&I==H&&($F H!=E)){C();K.append(E,I);}}$s void O(string&K){P(K,[&]()$s{$w(ULONG R;,)$a(uint64x2_t A;auto p=(uint8_t*)I;while(A=(uint64x2_t)(vld1q_u8(p)<33),!(A[0]|A[1]))p+=16;I=(B*)p+(A[0]?0:8)+$w((_BitScanForward64(&R,A[0]?:A[1]),R),__builtin_ctzll(A[0]?:A[1]))/8;,int J;$t C=M$(set1,32);$t*p=($t*)I;while(!(J=M$(movemask,M$(cmpeq,C,_mm256_max_epu8(C,_mm256_loadu_si256(p))))))p++;I=(B*)p+$w((_BitScanForward(&R,J),R),__builtin_ctz(J));)});}$s void O(D&A){P(A.K,[&](){I=(B*)memchr(I,10,H-I+1);});if(A.K.size()&&A.K.back()==13)A.K.pop_back();if(A.K.empty()||I$I void O(complex&K){T A,B{};if($F*I==40){I++;O(A);if($F*I++==44)Q(B),I++;}else O(A);K={A,B};}template$s void O(bitset&K){if(N>4095&&!*this)$R;ptrdiff_t i=N;while(i)if($F i%$z||H-I<$z)K[--i]=*I++==49;else{$t*p=($t*)I;for(int64_t j=0;j$I void Q(T&K){if(!is_same_v)while($F 0<=*I&&*I<33)I++;O(K);}$O bool(){$R!!*this;}bool $O!(){$R I>H;}};struct $i{G<0>A;G<1>B;$i(){struct stat D;E$(~fstat(0,&D))(D.st_mode>>12)==8?A.K(D.st_size):B.L();}$i*tie(nullptr_t){$R this;}void sync_with_stdio(bool){}$T>$I $i&$O>>(T&K){A.I?A.Q(K):B.Q(K);$r}$O bool(){$R!!*this;}bool $O!(){$R A.I?!A:!B;}};uint16_t A[100];char L[64]{1};struct -$o{char*D;B*I;int J;$o(){$w(E$(D=(char*)VirtualAlloc(0,536870912,8192,4))E$(VirtualAlloc(D,4096,4096,260))AddVectoredExceptionHandler(1,$x);,size_t C=536870912;$m(,struct rlimit E;getrlimit(RLIMIT_AS,&E);if(~E.rlim_cur)C=25165824;)D=(char*)mmap(0,C,3,$m(4162,16418),-1,0);E$(D!=(void*)-1))I=(B*)D;for(int i=0;i<100;i++)A[i]=(48+i/10)|((48+i%10)<<8);for(int i=1;i<64;i++)L[i]=L[i-1]+(0x8922489224892249>>i&1);}~$o(){flush(!J);}void flush(int F=0){$w(J=1;auto E=GetStdHandle(-11);auto C=F?ReOpenFile(E,1073741824,7,2684354560):(void*)-1;DWORD A;E$(C==(void*)-1?WriteFile(E,D,DWORD((char*)I-D),&A,0):(WriteFile(C,D,DWORD(((char*)I-D+4095)&-4096),&A,0)&&~_chsize(1,int((char*)I-D)))),auto G=D;ssize_t A;while((A=write(1,G,(char*)I-G))>0)G+=A;E$(~A))I=(B*)D;}$P(char)*I++=K;}$P(uint8_t)*I++=K;}$P(int8_t)*I++=K;}$P(bool)*I++=48+K;}$T>decltype((void)~T{1})F(T K){using D=make_unsigned_t;D C=K;if(K<0)F('-'),C=1+~C;static $C auto N=[](){arrayN{};D n=1;for(size_t i=1;i>32))?M+=32:_BitScanReverse(&M,(ULONG)C|1),_BitScanReverse64(&M,C|1)),M),63^__builtin_clzll(C|1))];G-=C>25];n=(n&33554431)*25;H|=A[n>>23]<<16;H|=uint64_t(48+((n&8388607)*5>>22))<<32;H>>=40-G*8;memcpy(I,&H,8);}else if $C(sizeof(T)==4){auto n=1441151881ULL*C;$H(n>>=25;n++;for(int i=0;i<5;i++){H[i]=A[n>>32];n=(n&~0U)*100;},int K=57;auto J=~0ULL>>7;for(int i=0;i<5;i++){H[i]=A[n>>K];n=(n&J)*25;K-=2;J/=4;})memcpy(I,(B*)H+10-G,16);}else{$H($u(if(C<(1ULL<<32)){$R F((uint32_t)C);}auto J=(uint64_t)1e10;auto x=C/J,y=C%J;int K=100000,b[]{int(x/K),int(x%K),int(y/K),int(y%K)};B H[40];for(int i=0;i<4;i++){uint32_t n=(429497ULL*b[i]>>7)+1;B*p=H+i*5;*p=48+(n>>25);n=(n&~0U>>7)*25;memcpy(p+1,A+(n>>23),2);memcpy(p+3,A+((n&~0U>>9)*25>>21),2);}),$u(uint64_t D,E=_umul128(18,C,&D),F;_umul128(0x725dd1d243aba0e8,C,&F);D+=__builtin_add_overflow(E,F+1,&E);for(int i=0;i<10;i++)H[i]=A[D],E=_umul128(100,E,&D);))memcpy(I,(B*)H+20-G,20);}I+=G;}$T>decltype((void)T{1.})F(T K){if(K<0)F('-'),K=-K;auto G=[&](){auto x=uint64_t(K*1e12);$H($u(x-=x>999999999999;uint32_t n[]{uint32_t(x/1000000*429497>>7)+1,uint32_t(x%1000000*429497>>7)+1};int K=25,J=~0U>>7;for(int i=0;i<3;i++){for(int j=0;j<2;j++)memcpy(I+i*2+j*6,A+(n[j]>>K),2),n[j]=(n[j]&J)*25;K-=2;J/=4;}I+=12;),$u(uint64_t D,E=_umul128(472236648287,x,&D)>>8;E|=D<<56;D>>=8;E++;for(int i=0;i<6;i++)memcpy(I,A+D,2),I+=2,E=_umul128(100,E,&D);))};if(!K)$R F('0');if(K>=1e16){K*=(T)1e-16;int B=16;while(K>=1)K*=(T).1,B++;F("0.");G();F('e');F(B);}else if(K>=1){auto B=(uint64_t)K;F(B);if(K-=B)F('.'),G();}else F("0."),G();}$P(const char*)$w(size_t A=strlen(K);memcpy((char*)I,K,A);I+=A;,I=(B*)stpcpy((char*)I,K);)}$P(const uint8_t*)F((char*)K);}$P(const int8_t*)F((char*)K);}$P(string_view)memcpy(I,K.data(),K.size());I+=K.size();}$T>$P(complex)*this<<'('<$s $P(const bitset&)auto i=N;while(i%$z)*I++=48+K[--i];$t*p=($t*)I;i/=$z;while(i){$a(auto A=(uint8x8_t)vdup_n_u16(((uint16_t*)&K)[--i]);*p++=48-vtstq_u8(vcombine_u8(vuzp2_u8(A,A),vuzp1_u8(A,A)),(uint8x16_t)vdupq_n_u64(~2ULL/254));,auto b=_mm256_set1_epi64x(~2ULL/254);_mm256_storeu_si256(p++,M$(sub,M$(set1,48),M$(cmpeq,_mm256_and_si256(M$(shuffle,_mm256_set1_epi32(((uint32_t*)&K)[--i]),_mm256_set_epi64x(0,C,C*2,C*3)),b),b)));)}I=(B*)p;}$T>$o&$O<<(const T&K){F(K);$r}$o&$O<<($o&(*A)($o&)){$R A(*this);}};struct $e{$T>$e&$O<<(const T&K){$r}$e&$O<<($e&(*A)($e&)){$R A(*this);}};}namespace std{$f::$i i$;$f::$o o$;$f::$e e$;$f::$i&getline($f::$i&B,string&K){$f::D A{K};$R B>>A;}$f::$o&flush($f::$o&B){if(!i$.A.I)B.flush();$R B;}$f::$o&endl($f::$o&B){$R B<<'\n'<ExceptionRecord;auto B=C->ExceptionInformation[1];if(C->ExceptionCode==2147483649&&B-(ULONG_PTR)std::o$.D<0x40000000){E$(VirtualAlloc((char*)B,16777216,4096,4)&&VirtualAlloc((char*)(B+16777216),4096,4096,260))$R-1;}$R 0;},) +$w(LONG WINAPI $x(_EXCEPTION_POINTERS*);,)namespace $f{using namespace std;struct B{enum $c A:char{}c;B&$O=(char x){c=A{x};$r}$O char(){$R(char)c;}};$C uint64_t C=~0ULL/255;struct D{string&K;};static B E[65568];templatestruct G{B*H,*I;void K(off_t C){$w(char*D=(char*)VirtualAlloc(0,(C+8191)&-4096,8192,1);E$(D)E$(VirtualFree(D,0,32768))DWORD A=C&-65536;E$(!A||MapViewOfFileEx(CreateFileMapping(GetStdHandle(-10),0,2,0,A,0),4,0,0,0,D)==D)E$(VirtualAlloc(D+A,65536,12288,4)==D+A)E$(~_lseek(0,A,0))DWORD E=0;ReadFile(GetStdHandle(-10),D+A,65536,&E,0);,int A=getpagesize();char*D=(char*)mmap(0,C+A,3,2,0,0);E$(D!=(void*)-1)E$(mmap(D+((C+A-1)&-A),A,3,$m(4114,50),-1,0)!=(void*)-1))H=(B*)D+C;*H=10;H[1]=48;H[2]=0;I=(B*)D;}void L(){H=I=E;}$I void M(){if(F&&I==H){$w(DWORD A=0;ReadFile(GetStdHandle(-10),I=E,65536,&A,0);,$a($u(register long A asm("x0")=0,D asm("x1")=(long)E,G asm("x2")=65536,C asm($m("x16","x8"))=$m(3,63);asm volatile("svc 0" $m("x80",):"+r"(A),"+r"(D):"r"(C),"r"(G));I=launder(E);),off_t A=$H(3,$m(33554435,0));B*D=E;asm volatile($H("int $128","syscall"):"+a"(A),$H("+c"(D):"b","+S"(D):"D")(0),"d"(65536)$H(,$u(:"rcx","r11")));I=D;))H=I+A;*H=10;if(!A)E[1]=48,E[2]=0;}}$T>$I void N(T&x){while($F(*I&240)==48)x=x*10+(*I++-48);}$T>$I decltype((void)~T{1})O(T&x){M();int A=is_signed_v&&*I==45;I+=A;N(x=0);x=A?1+~x:x;}$T>$I decltype((void)T{1.})O(T&x){M();int A=*I==45;I+=A;$F I+=*I==43;uint64_t n=0;int i=0;for(;i<18&&($F*I&240)==48;i++)n=n*10+*I++-48;int B=20;int C=*I==46;I+=C;for(;i<18&&($F*I&240)==48;i++)n=n*10+*I++-48,B-=C;x=(T)n;while(($F*I&240)==48)x=x*10+*I++-48,B-=C;if(*I==46)I++,C=1;while(($F*I&240)==48)x=x*10+*I++-48,B-=C;int D;if((*I|32)==101)I++,$F I+=*I==43,O(D),B+=D;static $C auto E=[](){arrayE{};T x=1;for(int i=21;i--;)E[40-i]=x,E[i]=1/x,x*=10;$R E;}();while(B>40)x*=(T)1e10,B-=10;while(B<0)x*=(T)1e-10,B+=10;x*=E[B];x=A?-x:x;}$I void O(bool&x){$F x=*I++==49;}$I void O(char&x){$F x=*I++;}$I void O(uint8_t&x){$F x=*I++;}$I void O(int8_t&x){$F x=*I++;}$T>$s void P(string&K,T C){M();B*G=I;C();K.assign((char*)G,I-G);while(F&&I==H&&($F H!=E)){C();K.append(E,I);}}$s void O(string&K){P(K,[&]()$s{B*p=I;$w(ULONG R;,)$a(uint64x2_t A;while(A=(uint64x2_t)(vld1q_u8((uint8_t*)p)<33),!(A[0]|A[1]))p+=16;I=p+(A[0]?0:8)+$w((_BitScanForward64(&R,A[0]?:A[1]),R),__builtin_ctzll(A[0]?:A[1]))/8;,int J;$t C=M$(set1,32);while(!(J=M$(movemask,M$(cmpeq,C,_mm256_max_epu8(C,_mm256_loadu_si256(($t*)p))))))p+=32;I=p+$w((_BitScanForward(&R,J),R),__builtin_ctz(J));)});}$s void O(D&A){P(A.K,[&](){I=(B*)memchr(I,10,H-I+1);});if(A.K.size()&&A.K.back()==13)A.K.pop_back();if(A.K.empty()||I$I void O(complex&K){T A,B{};if($F*I==40){I++;O(A);if($F*I++==44)Q(B),I++;}else O(A);K={A,B};}template$s void O(bitset&K){if(N>4095&&!*this)$R;ptrdiff_t i=N;while(i)if($F i%$z||H-I<$z)K[--i]=*I++==49;else{B*p=I;for(int64_t j=0;j$I void Q(T&K){if(!is_same_v)while($F 0<=*I&&*I<33)I++;O(K);}$O bool(){$R!!*this;}bool $O!(){$R I>H;}};struct $i{G<0>A;G<1>B;$i(){struct stat D;E$(~fstat(0,&D))(D.st_mode>>12)==8?A.K(D.st_size):B.L();}$i*tie(nullptr_t){$R this;}void sync_with_stdio(bool){}$T>$I $i&$O>>(T&K){A.I?A.Q(K):B.Q(K);$r}$O bool(){$R!!*this;}bool $O!(){$R A.I?!A:!B;}};uint16_t A[100];char L[64]{1};struct +$o{char*D;B*I;int J;$o(){$w(E$(D=(char*)VirtualAlloc(0,536870912,8192,4))E$(VirtualAlloc(D,4096,4096,260))AddVectoredExceptionHandler(1,$x);,size_t C=536870912;$m(,struct rlimit E;getrlimit(RLIMIT_AS,&E);if(~E.rlim_cur)C=25165824;)D=(char*)mmap(0,C,3,$m(4162,16418),-1,0);E$(D!=(void*)-1))I=(B*)D;for(int i=0;i<100;i++)A[i]=(48+i/10)|((48+i%10)<<8);for(int i=1;i<64;i++)L[i]=L[i-1]+(0x8922489224892249>>i&1);}~$o(){flush(!J);}void flush(int F=0){$w(J=1;auto E=GetStdHandle(-11);auto C=F?ReOpenFile(E,1073741824,7,2684354560):(void*)-1;DWORD A;E$(C==(void*)-1?WriteFile(E,D,DWORD((char*)I-D),&A,0):(WriteFile(C,D,DWORD(((char*)I-D+4095)&-4096),&A,0)&&~_chsize(1,int((char*)I-D)))),auto G=D;ssize_t A;while((A=write(1,G,(char*)I-G))>0)G+=A;E$(~A))I=(B*)D;}$P(char)*I++=K;}$P(uint8_t)*I++=K;}$P(int8_t)*I++=K;}$P(bool)*I++=48+K;}$T>decltype((void)~T{1})F(T K){using D=make_unsigned_t;D C=K;if(K<0)F('-'),C=1+~C;static $C auto N=[](){arrayN{};D n=1;for(size_t i=1;i>32))?M+=32:_BitScanReverse(&M,(ULONG)C|1),_BitScanReverse64(&M,C|1)),M),63^__builtin_clzll(C|1))];G-=C>25];n=(n&33554431)*25;H|=A[n>>23]<<16;H|=uint64_t(48+((n&8388607)*5>>22))<<32;H>>=40-G*8;memcpy(I,&H,8);}else if $C(sizeof(T)==4){auto n=1441151881ULL*C;$H(n>>=25;n++;for(int i=0;i<5;i++){H[i]=A[n>>32];n=(n&~0U)*100;},int K=57;auto J=~0ULL>>7;for(int i=0;i<5;i++){H[i]=A[n>>K];n=(n&J)*25;K-=2;J/=4;})memcpy(I,(B*)H+10-G,16);}else{$H($u(if(C<(1ULL<<32)){$R F((uint32_t)C);}auto J=(uint64_t)1e10;auto x=C/J,y=C%J;int K=100000,b[]{int(x/K),int(x%K),int(y/K),int(y%K)};B H[40];for(int i=0;i<4;i++){uint32_t n=(429497ULL*b[i]>>7)+1;B*p=H+i*5;*p=48+(n>>25);n=(n&~0U>>7)*25;memcpy(p+1,A+(n>>23),2);memcpy(p+3,A+((n&~0U>>9)*25>>21),2);}),$u(uint64_t D,E=_umul128(18,C,&D),F;_umul128(0x725dd1d243aba0e8,C,&F);D+=__builtin_add_overflow(E,F+1,&E);for(int i=0;i<10;i++)H[i]=A[D],E=_umul128(100,E,&D);))memcpy(I,(B*)H+20-G,20);}I+=G;}$T>decltype((void)T{1.})F(T K){if(K<0)F('-'),K=-K;auto G=[&](){auto x=uint64_t(K*1e12);$H($u(x-=x>999999999999;uint32_t n[]{uint32_t(x/1000000*429497>>7)+1,uint32_t(x%1000000*429497>>7)+1};int K=25,J=~0U>>7;for(int i=0;i<3;i++){for(int j=0;j<2;j++)memcpy(I+i*2+j*6,A+(n[j]>>K),2),n[j]=(n[j]&J)*25;K-=2;J/=4;}I+=12;),$u(uint64_t D,E=_umul128(472236648287,x,&D)>>8;E|=D<<56;D>>=8;E++;for(int i=0;i<6;i++)memcpy(I,A+D,2),I+=2,E=_umul128(100,E,&D);))};if(!K)$R F('0');if(K>=1e16){K*=(T)1e-16;int B=16;while(K>=1)K*=(T).1,B++;F("0.");G();F('e');F(B);}else if(K>=1){auto B=(uint64_t)K;F(B);if(K-=B)F('.'),G();}else F("0."),G();}$P(const char*)$w(size_t A=strlen(K);memcpy((char*)I,K,A);I+=A;,I=(B*)stpcpy((char*)I,K);)}$P(const uint8_t*)F((char*)K);}$P(const int8_t*)F((char*)K);}$P(string_view)memcpy(I,K.data(),K.size());I+=K.size();}$T>$P(complex)*this<<'('<$s $P(const bitset&)auto i=N;while(i%$z)*I++=48+K[--i];B*p=I;i/=$z;while(i){$a(auto A=(uint8x8_t)vdup_n_u16(((uint16_t*)&K)[--i]);vst1q_u8((uint8_t*)p,48-vtstq_u8(vcombine_u8(vuzp2_u8(A,A),vuzp1_u8(A,A)),(uint8x16_t)vdupq_n_u64(~2ULL/254)));,auto b=_mm256_set1_epi64x(~2ULL/254);_mm256_storeu_si256(($t*)p,M$(sub,M$(set1,48),M$(cmpeq,_mm256_and_si256(M$(shuffle,_mm256_set1_epi32(((uint32_t*)&K)[--i]),_mm256_set_epi64x(0,C,C*2,C*3)),b),b)));)p+=$z;}I=p;}$T>$o&$O<<(const T&K){F(K);$r}$o&$O<<($o&(*A)($o&)){$R A(*this);}};struct $e{$T>$e&$O<<(const T&K){$r}$e&$O<<($e&(*A)($e&)){$R A(*this);}};}namespace std{$f::$i i$;$f::$o o$;$f::$e e$;$f::$i&getline($f::$i&B,string&K){$f::D A{K};$R B>>A;}$f::$o&flush($f::$o&B){if(!i$.A.I)B.flush();$R B;}$f::$o&endl($f::$o&B){$R B<<'\n'<ExceptionRecord;auto B=C->ExceptionInformation[1];if(C->ExceptionCode==2147483649&&B-(ULONG_PTR)std::o$.D<0x40000000){E$(VirtualAlloc((char*)B,16777216,4096,4)&&VirtualAlloc((char*)(B+16777216),4096,4096,260))$R-1;}$R 0;},) #define freopen(...)if(freopen(__VA_ARGS__)==stdin)std::i$=$f::$i{} #define cin i$ #define cout o$