forked from peigen-sboxes/PEIGEN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunc.hpp
4687 lines (4256 loc) · 160 KB
/
func.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* PEIGEN: a Platform for Evaluation, Implementation, and Generation of S-boxes
*
* Copyright 2019 by
* Zhenzhen Bao <baozhenzhen10[at]gmail.com>
* Jian Guo <guojian[at]ntu.edu.sg>
* San Ling <lingsan[at]ntu.edu.sg>
* Yu Sasaki <yu[dot]sasaki[dot][email protected]>
*
* This platform is developed based on the open source application
* <http://jeremy.jean.free.fr/pub/fse2018_layer_implementations.tar.gz>
* Optimizing Implementations of Lightweight Building Blocks
*
* Copyright 2017 by
* Jade Tourteaux <Jade[dot]Tourteaux[at]gmail.com>
* Jérémy Jean <Jean[dot]Jeremy[at]gmail.com>
*
* We follow the same copyright policy.
*
* This file is part of some open source application.
*
* Some open source application is free software: you can redistribute
* it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* Some open source application is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Foobar. If not, see <http://www.gnu.org/licenses/>.
*
* @license GPL-3.0+ <http://spdx.org/licenses/GPL-3.0+>
*/
#ifndef FUNC_H__
#define FUNC_H__
#include "constants.hpp"
using namespace Peigen;
using namespace std;
// Compile-time switches for the diagnostic helper macros used in this file.
// NDEBUG is defined unconditionally right here, so the #ifdef that follows
// always takes its first branch and PRINT(x) expands to a no-op.
#define NDEBUG
#ifdef NDEBUG
// Debug printing disabled: the argument is discarded.
#define PRINT(x) ((void)0)
#else
// Debug printing enabled: the argument is emitted as written.
#define PRINT(x) x
#endif
// CHECKR(x) expands to x unless NCHECKR is defined, in which case it is a no-op.
#ifdef NCHECKR
#define CHECKR(x) ((void)0)
#else
#define CHECKR(x) x
#endif
// CHECK(x) expands to x unless NCHECKAB is defined, in which case it is a no-op.
#ifdef NCHECKAB
#define CHECK(x) ((void)0)
#else
#define CHECK(x) x
#endif
template<int N>
struct Peigen::function_t
{
/*
the bit-sliced representation of the function: N lines, indexed as
bit_slice[line][unit]
*/
bit_slice_t<N> bit_slice{{}};
/*
the line which was in the previous function
*/
bit_slice_l_t<N> prev = {{ 0 }};
/*
index of changed line; sort() re-points it at that line's position after sorting
*/
uint8_t info_line = 0;
/*
identifier for last boolean operation
*/
uint8_t info_op = 0;
/*
look-up table form of the function, held in LUT_XMM_N 128-bit words and
accessed byte by byte (one table entry per byte) by the conversion and
printing methods; it has no initialiser
*/
__m128i LUT[LUT_XMM_N];
// Default constructor: only the in-class member initialisers run; LUT is left uninitialised.
function_t() { };
/*
build the function from its bit-sliced form: the N lines of `a` are copied
into bit_slice and the LUT is then derived from them
*/
function_t(const bit_slice_t<N> a)
{
    int line = 0;
    while (line < N)
    {
        bit_slice[line] = a[line];
        ++line;
    }
    bit_slice_to_LUT();
}
function_t(const uint8_t aLUT[LUT_UNIT_N])
{
const __m128i * aLUTp = (const __m128i *)aLUT;
for (int i = 0; i < LUT_XMM_N; i++)
{
LUT[i] = _mm_loadu_si128(aLUTp + i);
}
LUT_to_bit_slice();
}
function_t(const string str)
{
parse_function(str);
}
/*
return the function whose table entry k is k (low 8 bits), for
k = 0 .. LUT_UNIT_N - 1, with its bit-sliced form derived from that table
*/
static function_t INPUT_DEFAULT()
{
    function_t identity;
    uint8_t * const entries = (uint8_t *)identity.LUT;
    for (int k = 0; k < LUT_UNIT_N; ++k)
    {
        entries[k] = k & 0xff;
    }
    identity.LUT_to_bit_slice();
    return identity;
}
/*
load the function from a string, choosing the format by its length:
 - bit-sliced form: N groups of BIT_SLICE_NIBBLES_N characters joined by
   N - 1 separators
 - LUT form: BIT_SLICE_BITS_N * 2 characters
The other representation is derived afterwards. For any other length an
error is printed and the object is left as it was (no exit here).
*/
void parse_function(const string str)
{
    const auto sliced_len = (BIT_SLICE_NIBBLES_N * N) + (N - 1);
    const auto lut_len = (BIT_SLICE_BITS_N<<1);

    if (str.size() == sliced_len)
    {
        string_to_bit_slice(str);
        bit_slice_to_LUT();
        return;
    }
    if (str.size() == lut_len)
    {
        string_to_LUT(str);
        LUT_to_bit_slice();
        return;
    }
    cout << "Error: Length of the string representing the function should be either "
         << sliced_len << " or "
         << lut_len << endl;
}
// Ordering of functions: decided solely by comparing the bit-sliced forms.
bool operator < (const function_t& f) const
{
    const bool precedes = (this->bit_slice < f.bit_slice);
    return precedes;
}
// Equality of functions: decided solely by comparing the bit-sliced forms.
bool operator == (const function_t& f) const
{
    const bool same = (this->bit_slice == f.bit_slice);
    return same;
}
/*
render the bit-sliced form as hex text: lines from N - 1 down to 0, each
line written unit by unit from the last unit to the first, every unit
zero-padded to UNIT_NIBBLE_N digits; lines are joined by '_'
*/
string to_string() const
{
    stringstream out;
    out << hex << setfill('0');
    for (int line = N; line-- > 0; )
    {
        for (int unit = UNIT_N; unit-- > 0; )
        {
            // "+ '\0'" promotes the unit so it prints as a number, not a character
            out << setw(UNIT_NIBBLE_N) << bit_slice[line][unit] + '\0';
        }
        out << "_";
    }
    string text = out.str();
    // drop the separator written after the final line
    text.pop_back();
    return text;
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
/*
render the table as one hex string: the first LUT_UNIT_N bytes of LUT,
two zero-padded digits each, no separators
*/
string LUT_to_string() const
{
    const uint8_t * const table = (const uint8_t *) LUT;
    stringstream out;
    out << hex << setfill('0');
    int k = 0;
    while (k < LUT_UNIT_N)
    {
        // "+ '\0'" promotes the byte so it prints as a number, not a character
        out << setw(2) << table[k] + '\0';
        ++k;
    }
    return out.str();
}
/*
render the table as an initialiser-style listing:
a "LUT = {" line, then every entry as "0x??," on a single line, then "};"
*/
string show_LUT() const
{
    const uint8_t * const table = (const uint8_t *) LUT;
    stringstream out;
    out << "LUT = {" << endl;
    out << hex << setfill('0');
    for (int k = 0; k < LUT_UNIT_N; ++k)
    {
        out << "0x";
        out << setw(2) << table[k] + '\0';
        out << ",";
    }
    out << endl;
    out << "};" << endl;
    return out.str();
}
// Write the listing produced by show_LUT() to the given file stream.
void show_LUT(ofstream & fout) const
{
    const string listing = show_LUT();
    fout << listing;
}
void string_to_bit_slice(const string& str)
{
if(str.size() == (BIT_SLICE_NIBBLES_N * N) + (N - 1))
{
stringstream ss(str);
string number;
int i = N - 1;
while(getline(ss, number, '_') && i >= 0)
{
if(number.length() != BIT_SLICE_NIBBLES_N)
{
cout << "Length of each substring error.";
exit(1);
}
for (int j = UNIT_N - 1; j >= 0; j--)
{
bit_slice[i][j] = stoul(number.substr((UNIT_N - 1 - j)*UNIT_NIBBLE_N, UNIT_NIBBLE_N), nullptr, 16);
}
i--;
}
}
else
{
cout << "Error: Length of the string representing the function in bitsliced form should be "
<< (BIT_SLICE_NIBBLES_N * N) + (N - 1) << endl;
exit(1);
}
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
void string_to_LUT(const string& str)
{
if(str.size() == (BIT_SLICE_BITS_N<<1))
{
string tmp;
uint8_t * x = (uint8_t *)LUT;
for (int i = 0; i < LUT_BYTE_N; i++)
{
tmp = str.substr(i * 2, 2);
x[i] = (uint8_t) stoul(tmp, nullptr, 16);
}
}
else
{
cout << "Error: Length of the string representing the function in LUT form should be "
<< (BIT_SLICE_BITS_N<<1) << endl;
exit(1);
}
}
/*
render the table as "{0x..,0x..,...}": LUT_UNIT_N entries, each
zero-padded to LUT_UNIT_NIBBLE_N hex digits
*/
string show() const
{
    const uint8_t * const table = (const uint8_t *) LUT;
    stringstream out;
    out << hex << setfill('0');
    out << "{";
    for (int k = 0; k < LUT_UNIT_N; ++k)
    {
        out << "0x";
        out << setw(LUT_UNIT_NIBBLE_N) << table[k] + '\0';
        out << ",";
    }
    string text = out.str();
    // replace the trailing comma by the closing brace
    text.pop_back();
    text = text + "}";
    return text;
}
/*
render the table without any decoration: LUT_UNIT_N entries, each
zero-padded to LUT_UNIT_NIBBLE_N hex digits, written back to back
*/
string show_concise() const
{
    const uint8_t * const table = (const uint8_t *) LUT;
    stringstream out;
    out << hex << setfill('0');
    int k = 0;
    while (k < LUT_UNIT_N)
    {
        out << setw(LUT_UNIT_NIBBLE_N) << table[k] + '\0';
        ++k;
    }
    return out.str();
}
void sort()
{
bit_slice_l_t<N> mod = bit_slice[info_line];
StaticSort<N> staticSort;
staticSort(bit_slice);
for(int i = 0; i < N; i++)
{
if(bit_slice[i] == mod) info_line = i;
}
}
void sort(int iPE[N])
{
bit_slice_PE_t<N> b_c_tmp;
for (int i = 0; i < N; i++)
{
b_c_tmp[i].first = bit_slice[i];
b_c_tmp[i].second = i;
}
StaticSort<N> staticSort;
staticSort(b_c_tmp);
for (int i = 0; i < N; i++)
{
bit_slice[i] = b_c_tmp[i].first;
iPE[b_c_tmp[i].second] = i;
}
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
/*
rebuild bit_slice from LUT: bit b of table entry k becomes bit k of line b.
Every LUT byte is one table entry. _mm_movemask_epi8 gathers the top bit of
each of the 16 bytes of a word, so shifting the wanted bit of every entry
into bit 7 extracts one line for 16 entries at a time.
No branch matches N > 8; in that case nothing is done.
*/
void LUT_to_bit_slice()
{
if (N == 3)
{
// keep only the low 8 bytes of the word; this also rewrites LUT[0] with its upper 8 bytes cleared
const __m128i mask = _mm_set_epi64x(0x0, 0xffffffffffffffffULL);
LUT[0] = _mm_and_si128(LUT[0], mask);
// shift by 5 / 6 / 7 brings entry bit 2 / 1 / 0 to bit 7 of each byte
bit_slice[2][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 5));
bit_slice[1][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 6));
bit_slice[0][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 7));
}
else if (N == 4)
{
// all 16 bytes of LUT[0] are used; shift by 4 .. 7 brings entry bit 3 .. 0 to bit 7
bit_slice[3][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 4));
bit_slice[2][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 5));
bit_slice[1][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 6));
bit_slice[0][0] = _mm_movemask_epi8(_mm_slli_epi16(LUT[0], 7));
}
else if (N <= 8)
{
// working copies with entry bit N - 1 moved to bit 7 of each byte
__m128i xmm[LUT_XMM_N];
for (int i = 0; i < LUT_XMM_N; i++)
{
xmm[i] = _mm_slli_epi16(LUT[i], 8 - N);
}
// lines are produced from N - 1 down to 0; after each line the working
// copies are shifted by one so the next lower entry bit sits at bit 7
for (int i = N - 1; i >= 0; i--)
{
// words are visited from the last to the first: each 16-bit mask is
// OR-ed into the low bits of its unit after the unit was shifted up by
// 16, so the mask of a lower-numbered word ends up in lower bits.
// (j*16)/UNIT_BIT_N picks the unit - presumably UNIT_BIT_N is the bit
// width of one unit; TODO confirm.
// NOTE(review): the unit is not cleared first; old contents are only
// removed by the repeated 16-bit shifts - confirm for every unit width.
for (int j = LUT_XMM_N - 1; j >= 0; j--)
{
bit_slice[i][(j*16)/UNIT_BIT_N] <<= 16;
bit_slice[i][(j*16)/UNIT_BIT_N] |= (unsigned short)_mm_movemask_epi8(xmm[j]);
xmm[j] = _mm_slli_epi16(xmm[j], 1);
}
}
}
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
void LUT_to_bit_slice(uint8_t x[])
{
for (int i = 0; i < LUT_XMM_N; i++)
{
LUT[i] = _mm_loadu_si128((__m128i *)(x + i * 16));
}
LUT_to_bit_slice();
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
/*
rebuild LUT from bit_slice: bit k of line b becomes bit b of table entry k.
LUT is written through the byte pointer x, one table entry per byte.

Every branch follows the same scheme: the bytes of the lines are rearranged
so that the bytes covering the same 8 entries sit next to each other (one
byte per line), then _mm_movemask_epi8 gathers the top bit of each byte -
i.e. one bit per line - into an integer. The loop runs i = 7 .. 0 and
shifts the registers left by one after each step, so step i sees bit i of
every byte and yields the entries whose index is i plus a multiple of 8.

In the inline comments "a_b" denotes byte b of line a and "x" a byte that
does not belong to any line. For any N outside 3 .. 8 a message is printed
and LUT is left untouched.
*/
void bit_slice_to_LUT()
{
uint8_t * x = (uint8_t *) LUT;
if (N == 3)
{
// 8 entries: only the low three bytes of the load are kept by the mask
const __m128i mask = _mm_set_epi64x(0x0, 0xffffffULL);
__m128i xmm[1];
xmm[0] = _mm_loadu_si128((__m128i *)((*bit_slice.data()).data()));
xmm[0] = _mm_and_si128(xmm[0], mask);
for (int i = 7; i >= 0; i--)
{
// bits 0..2 of tmp are bit i of the three kept bytes
int tmp = _mm_movemask_epi8(xmm[0]);
x[i] = tmp & 0x7;
xmm[0] = _mm_slli_epi16(xmm[0], 1);
}
}
else if (N == 4)
{
// 16 entries: the shuffle moves source bytes 0,2,4,6 to positions 0..3 and
// source bytes 1,3,5,7 to positions 8..11; all other positions are zeroed (0x80)
const __m128i smm = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x7, 0x5, 0x3, 0x1, 0x80, 0x80, 0x80, 0x80, 0x6, 0x4, 0x2, 0x0);
__m128i xmm[1];
xmm[0] = _mm_loadu_si128((__m128i *)((*bit_slice.data()).data()));
xmm[0] = _mm_shuffle_epi8(xmm[0], smm);
for (int i = 7; i >= 0; i--)
{
// low nibble of tmp -> entries 0..7, nibble at bits 8..11 -> entries 8..15
int tmp = _mm_movemask_epi8(xmm[0]);
x[i + 8] = (tmp >> 8) & 0xf; x[i] = tmp & 0xf;
xmm[0] = _mm_slli_epi16(xmm[0], 1);
}
}
else if (N == 5)
{
// 32 entries: lines 0..3 come from the first load, line 4 from the second;
// mmmask keeps only the lowest byte of every 32-bit group, which after the
// shuffle is where the bytes of line 4 sit
const __m128i mmmask = _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff);
const __m128i smm = _mm_set_epi8(0xf, 0xb, 0x7, 0x3, 0xe, 0xa, 0x6, 0x2, 0xd, 0x9, 0x5, 0x1, 0xc, 0x8, 0x4, 0x0);
__m128i xmm[2];
xmm[0] = _mm_loadu_si128((__m128i *)bit_slice.data()); // 3_3 3_2 3_1 3_0 2_3 2_2 2_1 2_0 1_3 1_2 1_1 1_0 0_3 0_2 0_1 0_0
xmm[1] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 1); // x_3 x_2 x_1 x_0 x_3 x_2 x_1 x_0 x_3 x_2 x_1 x_0 4_3 4_2 4_1 4_0
xmm[0] = _mm_shuffle_epi8(xmm[0], smm); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
xmm[1] = _mm_shuffle_epi8(xmm[1], smm);
xmm[1] = _mm_and_si128(xmm[1], mmmask); // x_3 x_3 x_3 4_3 x_2 x_2 x_2 4_2 x_1 x_1 x_1 4_1 x_0 x_0 x_0 4_0
for (int i = 7; i >= 0; i--)
{
int tmp0 = _mm_movemask_epi8(xmm[0]);
int tmp1 = _mm_movemask_epi8(xmm[1]);
// each nibble of tmp0 gives bits 0..3 of an entry; the matching bit of
// tmp1 (bit 12, 8, 4 or 0) is moved to bit 4 (0x10) to supply line 4
x[i + 8 * 3] = ((tmp0 >> (4 * 3)) & 0xf) | ((tmp1 >> (4 * 2)) & 0x10);
x[i + 8 * 2] = ((tmp0 >> (4 * 2)) & 0xf) | ((tmp1 >> (4 * 1)) & 0x10);
x[i + 8 * 1] = ((tmp0 >> (4 * 1)) & 0xf) | ((tmp1 >> (4 * 0)) & 0x10);
x[i + 8 * 0] = ((tmp0 >> (4 * 0)) & 0xf) | ((tmp1 << (4 * 1)) & 0x10);
xmm[0] = _mm_slli_epi16(xmm[0], 1);
xmm[1] = _mm_slli_epi16(xmm[1], 1);
}
}
else if (N == 6)
{
// 64 entries: three loads hold two lines each; a zero register stands in
// for the two missing lines so the same unpack network as for 8 lines works
const __m128i smm = _mm_set_epi8(0xf, 0x7, 0xe, 0x6, 0xd, 0x5, 0xc, 0x4, 0xb, 0x3, 0xa, 0x2, 0x9, 0x1, 0x8, 0x0);
__m128i xmm[4];
__m128i tmm[4];
xmm[0] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 0); // 1_7 1_6 1_5 1_4 1_3 1_2 1_1 1_0 0_7 0_6 0_5 0_4 0_3 0_2 0_1 0_0
xmm[1] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 1); // 3_7 3_6 3_5 3_4 3_3 3_2 3_1 3_0 2_7 2_6 2_5 2_4 2_3 2_2 2_1 2_0
xmm[2] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 2); // 5_7 5_6 5_5 5_4 5_3 5_2 5_1 5_0 4_7 4_6 4_5 4_4 4_3 4_2 4_1 4_0
xmm[0] = _mm_shuffle_epi8(xmm[0], smm); // 1_7 0_7 1_6 0_6 1_5 0_5 1_4 0_4 1_3 0_3 1_2 0_2 1_1 0_1 1_0 0_0
xmm[1] = _mm_shuffle_epi8(xmm[1], smm); // 3_7 2_7 3_6 2_6 3_5 2_5 3_4 2_4 3_3 2_3 3_2 2_2 3_1 2_1 3_0 2_0
xmm[2] = _mm_shuffle_epi8(xmm[2], smm); // 5_7 4_7 5_6 4_6 5_5 4_5 5_4 4_4 5_3 4_3 5_2 4_2 5_1 4_1 5_0 4_0
xmm[3] = _mm_setzero_si128(); // x_7 x_7 x_6 x_6 x_5 x_5 x_4 x_4 x_3 x_3 x_2 x_2 x_1 x_1 x_0 x_0
tmm[0] = _mm_unpacklo_epi16(xmm[0], xmm[1]); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
tmm[1] = _mm_unpackhi_epi16(xmm[0], xmm[1]); // 3_7 2_7 1_7 0_7 3_6 2_6 1_6 0_6 3_5 2_5 1_5 0_5 3_4 2_4 1_4 0_4
tmm[2] = _mm_unpacklo_epi16(xmm[2], xmm[3]); // x_3 x_3 5_3 4_3 x_2 x_2 5_2 4_2 x_1 x_1 5_1 4_1 x_0 x_0 5_0 4_0
tmm[3] = _mm_unpackhi_epi16(xmm[2], xmm[3]); // x_7 x_7 5_7 4_7 x_6 x_6 5_6 4_6 x_5 x_5 5_5 4_5 x_4 x_4 5_4 4_4
xmm[0] = _mm_unpacklo_epi32(tmm[0], tmm[2]); // x_1 x_1 5_1 4_1 3_1 2_1 1_1 0_1 x_0 x_0 5_0 4_0 3_0 2_0 1_0 0_0
xmm[1] = _mm_unpackhi_epi32(tmm[0], tmm[2]); // x_3 x_3 5_3 4_3 3_3 2_3 1_3 0_3 x_2 x_2 5_2 4_2 3_2 2_2 1_2 0_2
xmm[2] = _mm_unpacklo_epi32(tmm[1], tmm[3]); // x_5 x_5 5_5 4_5 3_5 2_5 1_5 0_5 x_4 x_4 5_4 4_4 3_4 2_4 1_4 0_4
xmm[3] = _mm_unpackhi_epi32(tmm[1], tmm[3]); // x_7 x_7 5_7 4_7 3_7 2_7 1_7 0_7 x_6 x_6 5_6 4_6 3_6 2_6 1_6 0_6
for (int i = 7; i >= 0; i--)
{
int tmp0 = _mm_movemask_epi8(xmm[0]);
int tmp1 = _mm_movemask_epi8(xmm[1]);
int tmp2 = _mm_movemask_epi8(xmm[2]);
int tmp3 = _mm_movemask_epi8(xmm[3]);
// each byte of a mask is one entry; 0x3f keeps the six line bits
x[i + 8 * 7] = ((tmp3 >> (8 * 1)) & 0x3f); x[i + 8 * 6] = ((tmp3 >> (8 * 0)) & 0x3f);
x[i + 8 * 5] = ((tmp2 >> (8 * 1)) & 0x3f); x[i + 8 * 4] = ((tmp2 >> (8 * 0)) & 0x3f);
x[i + 8 * 3] = ((tmp1 >> (8 * 1)) & 0x3f); x[i + 8 * 2] = ((tmp1 >> (8 * 0)) & 0x3f);
x[i + 8 * 1] = ((tmp0 >> (8 * 1)) & 0x3f); x[i + 8 * 0] = ((tmp0 >> (8 * 0)) & 0x3f);
xmm[0] = _mm_slli_epi16(xmm[0], 1);
xmm[1] = _mm_slli_epi16(xmm[1], 1);
xmm[2] = _mm_slli_epi16(xmm[2], 1);
xmm[3] = _mm_slli_epi16(xmm[3], 1);
}
}
else if (N == 7)
{
// 128 entries: one 16-byte load per line; an all-zero eighth row completes
// the 8 x 16 byte matrix that the unpack stages transpose
__m128i xmm[8];
__m128i tmm[8];
for (int i = 0; i < 7; i++)
{
xmm[i] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + i);
}
xmm[7] = _mm_setzero_si128();
// 0_f 0_e 0_d 0_c 0_b 0_a 0_9 0_8 0_7 0_6 0_5 0_4 0_3 0_2 0_1 0_0
// 1_f 1_e 1_d 1_c 1_b 1_a 1_9 1_8 1_7 1_6 1_5 1_4 1_3 1_2 1_1 1_0
// 2_f 2_e 2_d 2_c 2_b 2_a 2_9 2_8 2_7 2_6 2_5 2_4 2_3 2_2 2_1 2_0
// 3_f 3_e 3_d 3_c 3_b 3_a 3_9 3_8 3_7 3_6 3_5 3_4 3_3 3_2 3_1 3_0
// 4_f 4_e 4_d 4_c 4_b 4_a 4_9 4_8 4_7 4_6 4_5 4_4 4_3 4_2 4_1 4_0
// 5_f 5_e 5_d 5_c 5_b 5_a 5_9 5_8 5_7 5_6 5_5 5_4 5_3 5_2 5_1 5_0
// 6_f 6_e 6_d 6_c 6_b 6_a 6_9 6_8 6_7 6_6 6_5 6_4 6_3 6_2 6_1 6_0
// x_f x_e x_d x_c x_b x_a x_9 x_8 x_7 x_6 x_5 x_4 x_3 x_2 x_1 x_0
tmm[0] = _mm_unpacklo_epi8(xmm[0], xmm[1]); // 1_7 0_7 1_6 0_6 1_5 0_5 1_4 0_4 1_3 0_3 1_2 0_2 1_1 0_1 1_0 0_0
tmm[1] = _mm_unpackhi_epi8(xmm[0], xmm[1]); // 1_f 0_f 1_e 0_e 1_d 0_d 1_c 0_c 1_b 0_b 1_a 0_a 1_9 0_9 1_8 0_8
tmm[2] = _mm_unpacklo_epi8(xmm[2], xmm[3]); // 3_7 2_7 3_6 2_6 3_5 2_5 3_4 2_4 3_3 2_3 3_2 2_2 3_1 2_1 3_0 2_0
tmm[3] = _mm_unpackhi_epi8(xmm[2], xmm[3]); // 3_f 2_f 3_e 2_e 3_d 2_d 3_c 2_c 3_b 2_b 3_a 2_a 3_9 2_9 3_8 2_8
tmm[4] = _mm_unpacklo_epi8(xmm[4], xmm[5]); // 5_7 4_7 5_6 4_6 5_5 4_5 5_4 4_4 5_3 4_3 5_2 4_2 5_1 4_1 5_0 4_0
tmm[5] = _mm_unpackhi_epi8(xmm[4], xmm[5]); // 5_f 4_f 5_e 4_e 5_d 4_d 5_c 4_c 5_b 4_b 5_a 4_a 5_9 4_9 5_8 4_8
tmm[6] = _mm_unpacklo_epi8(xmm[6], xmm[7]); // x_7 6_7 x_6 6_6 x_5 6_5 x_4 6_4 x_3 6_3 x_2 6_2 x_1 6_1 x_0 6_0
tmm[7] = _mm_unpackhi_epi8(xmm[6], xmm[7]); // x_f 6_f x_e 6_e x_d 6_d x_c 6_c x_b 6_b x_a 6_a x_9 6_9 x_8 6_8
xmm[0] = _mm_unpacklo_epi16(tmm[0], tmm[2]); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
xmm[1] = _mm_unpackhi_epi16(tmm[0], tmm[2]); // 3_7 2_7 1_7 0_7 3_6 2_6 1_6 0_6 3_5 2_5 1_5 0_5 3_4 2_4 1_4 0_4
xmm[2] = _mm_unpacklo_epi16(tmm[1], tmm[3]); // 3_b 2_b 1_b 0_b 3_a 2_a 1_a 0_a 3_9 2_9 1_9 0_9 3_8 2_8 1_8 0_8
xmm[3] = _mm_unpackhi_epi16(tmm[1], tmm[3]); // 3_f 2_f 1_f 0_f 3_e 2_e 1_e 0_e 3_d 2_d 1_d 0_d 3_c 2_c 1_c 0_c
xmm[4] = _mm_unpacklo_epi16(tmm[4], tmm[6]); // x_3 6_3 5_3 4_3 x_2 6_2 5_2 4_2 x_1 6_1 5_1 4_1 x_0 6_0 5_0 4_0
xmm[5] = _mm_unpackhi_epi16(tmm[4], tmm[6]); // x_7 6_7 5_7 4_7 x_6 6_6 5_6 4_6 x_5 6_5 5_5 4_5 x_4 6_4 5_4 4_4
xmm[6] = _mm_unpacklo_epi16(tmm[5], tmm[7]); // x_b 6_b 5_b 4_b x_a 6_a 5_a 4_a x_9 6_9 5_9 4_9 x_8 6_8 5_8 4_8
xmm[7] = _mm_unpackhi_epi16(tmm[5], tmm[7]); // x_f 6_f 5_f 4_f x_e 6_e 5_e 4_e x_d 6_d 5_d 4_d x_c 6_c 5_c 4_c
tmm[0] = _mm_unpacklo_epi32(xmm[0], xmm[4]); // x_1 6_1 5_1 4_1 3_1 2_1 1_1 0_1 x_0 6_0 5_0 4_0 3_0 2_0 1_0 0_0
tmm[1] = _mm_unpackhi_epi32(xmm[0], xmm[4]); // x_3 6_3 5_3 4_3 3_3 2_3 1_3 0_3 x_2 6_2 5_2 4_2 3_2 2_2 1_2 0_2
tmm[2] = _mm_unpacklo_epi32(xmm[1], xmm[5]); // x_5 6_5 5_5 4_5 3_5 2_5 1_5 0_5 x_4 6_4 5_4 4_4 3_4 2_4 1_4 0_4
tmm[3] = _mm_unpackhi_epi32(xmm[1], xmm[5]); // x_7 6_7 5_7 4_7 3_7 2_7 1_7 0_7 x_6 6_6 5_6 4_6 3_6 2_6 1_6 0_6
tmm[4] = _mm_unpacklo_epi32(xmm[2], xmm[6]); // x_9 6_9 5_9 4_9 3_9 2_9 1_9 0_9 x_8 6_8 5_8 4_8 3_8 2_8 1_8 0_8
tmm[5] = _mm_unpackhi_epi32(xmm[2], xmm[6]); // x_b 6_b 5_b 4_b 3_b 2_b 1_b 0_b x_a 6_a 5_a 4_a 3_a 2_a 1_a 0_a
tmm[6] = _mm_unpacklo_epi32(xmm[3], xmm[7]); // x_d 6_d 5_d 4_d 3_d 2_d 1_d 0_d x_c 6_c 5_c 4_c 3_c 2_c 1_c 0_c
tmm[7] = _mm_unpackhi_epi32(xmm[3], xmm[7]); // x_f 6_f 5_f 4_f 3_f 2_f 1_f 0_f x_e 6_e 5_e 4_e 3_e 2_e 1_e 0_e
int tmp[8];
for (int i = 7; i >= 0; i--)
{
tmp[0] = _mm_movemask_epi8(tmm[0]);
tmp[1] = _mm_movemask_epi8(tmm[1]);
tmp[2] = _mm_movemask_epi8(tmm[2]);
tmp[3] = _mm_movemask_epi8(tmm[3]);
tmp[4] = _mm_movemask_epi8(tmm[4]);
tmp[5] = _mm_movemask_epi8(tmm[5]);
tmp[6] = _mm_movemask_epi8(tmm[6]);
tmp[7] = _mm_movemask_epi8(tmm[7]);
// each byte of a mask is one entry; 0x7f keeps the seven line bits
x[i + 16 * 7 + 8] = ((tmp[7] >> (8 * 1)) & 0x7f); x[i + 16 * 7] = ((tmp[7] >> (8 * 0)) & 0x7f);
x[i + 16 * 6 + 8] = ((tmp[6] >> (8 * 1)) & 0x7f); x[i + 16 * 6] = ((tmp[6] >> (8 * 0)) & 0x7f);
x[i + 16 * 5 + 8] = ((tmp[5] >> (8 * 1)) & 0x7f); x[i + 16 * 5] = ((tmp[5] >> (8 * 0)) & 0x7f);
x[i + 16 * 4 + 8] = ((tmp[4] >> (8 * 1)) & 0x7f); x[i + 16 * 4] = ((tmp[4] >> (8 * 0)) & 0x7f);
x[i + 16 * 3 + 8] = ((tmp[3] >> (8 * 1)) & 0x7f); x[i + 16 * 3] = ((tmp[3] >> (8 * 0)) & 0x7f);
x[i + 16 * 2 + 8] = ((tmp[2] >> (8 * 1)) & 0x7f); x[i + 16 * 2] = ((tmp[2] >> (8 * 0)) & 0x7f);
x[i + 16 * 1 + 8] = ((tmp[1] >> (8 * 1)) & 0x7f); x[i + 16 * 1] = ((tmp[1] >> (8 * 0)) & 0x7f);
x[i + 16 * 0 + 8] = ((tmp[0] >> (8 * 1)) & 0x7f); x[i + 16 * 0] = ((tmp[0] >> (8 * 0)) & 0x7f);
tmm[0] = _mm_slli_epi16(tmm[0], 1);
tmm[1] = _mm_slli_epi16(tmm[1], 1);
tmm[2] = _mm_slli_epi16(tmm[2], 1);
tmm[3] = _mm_slli_epi16(tmm[3], 1);
tmm[4] = _mm_slli_epi16(tmm[4], 1);
tmm[5] = _mm_slli_epi16(tmm[5], 1);
tmm[6] = _mm_slli_epi16(tmm[6], 1);
tmm[7] = _mm_slli_epi16(tmm[7], 1);
}
}
else if (N == 8)
{
// 256 entries: every line spans two 16-byte words; k selects the word
// (k = 1 first, then k = 0) and the 128 entries it produces go to
// x[128 * k ...]
__m128i xmm[8];
__m128i tmm[8];
for (int k = 1; k >= 0; k--)
{
for (int i = 0; i < 8; i++)
{
xmm[i] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + i * 2 + k);
}
tmm[0] = _mm_unpacklo_epi8(xmm[0], xmm[1]); // 1_7 0_7 1_6 0_6 1_5 0_5 1_4 0_4 1_3 0_3 1_2 0_2 1_1 0_1 1_0 0_0
tmm[1] = _mm_unpackhi_epi8(xmm[0], xmm[1]); // 1_f 0_f 1_e 0_e 1_d 0_d 1_c 0_c 1_b 0_b 1_a 0_a 1_9 0_9 1_8 0_8
tmm[2] = _mm_unpacklo_epi8(xmm[2], xmm[3]); // 3_7 2_7 3_6 2_6 3_5 2_5 3_4 2_4 3_3 2_3 3_2 2_2 3_1 2_1 3_0 2_0
tmm[3] = _mm_unpackhi_epi8(xmm[2], xmm[3]); // 3_f 2_f 3_e 2_e 3_d 2_d 3_c 2_c 3_b 2_b 3_a 2_a 3_9 2_9 3_8 2_8
tmm[4] = _mm_unpacklo_epi8(xmm[4], xmm[5]); // 5_7 4_7 5_6 4_6 5_5 4_5 5_4 4_4 5_3 4_3 5_2 4_2 5_1 4_1 5_0 4_0
tmm[5] = _mm_unpackhi_epi8(xmm[4], xmm[5]); // 5_f 4_f 5_e 4_e 5_d 4_d 5_c 4_c 5_b 4_b 5_a 4_a 5_9 4_9 5_8 4_8
tmm[6] = _mm_unpacklo_epi8(xmm[6], xmm[7]); // 7_7 6_7 7_6 6_6 7_5 6_5 7_4 6_4 7_3 6_3 7_2 6_2 7_1 6_1 7_0 6_0
tmm[7] = _mm_unpackhi_epi8(xmm[6], xmm[7]); // 7_f 6_f 7_e 6_e 7_d 6_d 7_c 6_c 7_b 6_b 7_a 6_a 7_9 6_9 7_8 6_8
xmm[0] = _mm_unpacklo_epi16(tmm[0], tmm[2]); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
xmm[1] = _mm_unpackhi_epi16(tmm[0], tmm[2]); // 3_7 2_7 1_7 0_7 3_6 2_6 1_6 0_6 3_5 2_5 1_5 0_5 3_4 2_4 1_4 0_4
xmm[2] = _mm_unpacklo_epi16(tmm[1], tmm[3]); // 3_b 2_b 1_b 0_b 3_a 2_a 1_a 0_a 3_9 2_9 1_9 0_9 3_8 2_8 1_8 0_8
xmm[3] = _mm_unpackhi_epi16(tmm[1], tmm[3]); // 3_f 2_f 1_f 0_f 3_e 2_e 1_e 0_e 3_d 2_d 1_d 0_d 3_c 2_c 1_c 0_c
xmm[4] = _mm_unpacklo_epi16(tmm[4], tmm[6]); // 7_3 6_3 5_3 4_3 7_2 6_2 5_2 4_2 7_1 6_1 5_1 4_1 7_0 6_0 5_0 4_0
xmm[5] = _mm_unpackhi_epi16(tmm[4], tmm[6]); // 7_7 6_7 5_7 4_7 7_6 6_6 5_6 4_6 7_5 6_5 5_5 4_5 7_4 6_4 5_4 4_4
xmm[6] = _mm_unpacklo_epi16(tmm[5], tmm[7]); // 7_b 6_b 5_b 4_b 7_a 6_a 5_a 4_a 7_9 6_9 5_9 4_9 7_8 6_8 5_8 4_8
xmm[7] = _mm_unpackhi_epi16(tmm[5], tmm[7]); // 7_f 6_f 5_f 4_f 7_e 6_e 5_e 4_e 7_d 6_d 5_d 4_d 7_c 6_c 5_c 4_c
tmm[0] = _mm_unpacklo_epi32(xmm[0], xmm[4]); // 7_1 6_1 5_1 4_1 3_1 2_1 1_1 0_1 7_0 6_0 5_0 4_0 3_0 2_0 1_0 0_0
tmm[1] = _mm_unpackhi_epi32(xmm[0], xmm[4]); // 7_3 6_3 5_3 4_3 3_3 2_3 1_3 0_3 7_2 6_2 5_2 4_2 3_2 2_2 1_2 0_2
tmm[2] = _mm_unpacklo_epi32(xmm[1], xmm[5]); // 7_5 6_5 5_5 4_5 3_5 2_5 1_5 0_5 7_4 6_4 5_4 4_4 3_4 2_4 1_4 0_4
tmm[3] = _mm_unpackhi_epi32(xmm[1], xmm[5]); // 7_7 6_7 5_7 4_7 3_7 2_7 1_7 0_7 7_6 6_6 5_6 4_6 3_6 2_6 1_6 0_6
tmm[4] = _mm_unpacklo_epi32(xmm[2], xmm[6]); // 7_9 6_9 5_9 4_9 3_9 2_9 1_9 0_9 7_8 6_8 5_8 4_8 3_8 2_8 1_8 0_8
tmm[5] = _mm_unpackhi_epi32(xmm[2], xmm[6]); // 7_b 6_b 5_b 4_b 3_b 2_b 1_b 0_b 7_a 6_a 5_a 4_a 3_a 2_a 1_a 0_a
tmm[6] = _mm_unpacklo_epi32(xmm[3], xmm[7]); // 7_d 6_d 5_d 4_d 3_d 2_d 1_d 0_d 7_c 6_c 5_c 4_c 3_c 2_c 1_c 0_c
tmm[7] = _mm_unpackhi_epi32(xmm[3], xmm[7]); // 7_f 6_f 5_f 4_f 3_f 2_f 1_f 0_f 7_e 6_e 5_e 4_e 3_e 2_e 1_e 0_e
int tmp[8];
for (int i = 7; i >= 0; i--)
{
tmp[0] = _mm_movemask_epi8(tmm[0]);
tmp[1] = _mm_movemask_epi8(tmm[1]);
tmp[2] = _mm_movemask_epi8(tmm[2]);
tmp[3] = _mm_movemask_epi8(tmm[3]);
tmp[4] = _mm_movemask_epi8(tmm[4]);
tmp[5] = _mm_movemask_epi8(tmm[5]);
tmp[6] = _mm_movemask_epi8(tmm[6]);
tmp[7] = _mm_movemask_epi8(tmm[7]);
// each byte of a mask is one complete 8-bit entry
x[128 * k + i + 16 * 7 + 8] = ((tmp[7] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 7] = ((tmp[7] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 6 + 8] = ((tmp[6] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 6] = ((tmp[6] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 5 + 8] = ((tmp[5] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 5] = ((tmp[5] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 4 + 8] = ((tmp[4] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 4] = ((tmp[4] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 3 + 8] = ((tmp[3] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 3] = ((tmp[3] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 2 + 8] = ((tmp[2] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 2] = ((tmp[2] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 1 + 8] = ((tmp[1] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 1] = ((tmp[1] >> (8 * 0)) & 0xff);
x[128 * k + i + 16 * 0 + 8] = ((tmp[0] >> (8 * 1)) & 0xff); x[128 * k + i + 16 * 0] = ((tmp[0] >> (8 * 0)) & 0xff);
tmm[0] = _mm_slli_epi16(tmm[0], 1);
tmm[1] = _mm_slli_epi16(tmm[1], 1);
tmm[2] = _mm_slli_epi16(tmm[2], 1);
tmm[3] = _mm_slli_epi16(tmm[3], 1);
tmm[4] = _mm_slli_epi16(tmm[4], 1);
tmm[5] = _mm_slli_epi16(tmm[5], 1);
tmm[6] = _mm_slli_epi16(tmm[6], 1);
tmm[7] = _mm_slli_epi16(tmm[7], 1);
}
}
}
else
{
// reached for every N outside 3 .. 8 (the message only mentions N > 8)
cout << "bit_slice_to_LUT():: Not support for N > 8." << endl;
}
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
void bit_slice_to_LUT(uint8_t x[])
{
bit_slice_to_LUT();
for (int i = 0; i < LUT_XMM_N; i++)
{
_mm_storeu_si128((__m128i *)(x + i * 16), LUT[i]);
}
}
// !!! Only applicable for N = 3, 4, 5, 6, 7, 8
void bit_slice_to_iLUT(uint8_t x[])
{
if (N == 3)
{
static const __m128i mask = _mm_set_epi64x(0x0, 0xffffffULL);
__m128i xmm[1];
xmm[0] = _mm_loadu_si128((__m128i *)bit_slice.data());
xmm[0] = _mm_and_si128(xmm[0], mask);
for (int i = 7; i >= 0; i--)
{
int tmp = _mm_movemask_epi8(xmm[0]);
x[tmp & 0x7] = i;
xmm[0] = _mm_slli_epi16(xmm[0], 1);
}
}
else if (N == 4)
{
static const __m128i smm = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x7, 0x5, 0x3, 0x1, 0x80, 0x80, 0x80, 0x80, 0x6, 0x4, 0x2, 0x0);
__m128i xmm[1];
xmm[0] = _mm_loadu_si128((__m128i *)bit_slice.data());
xmm[0] = _mm_shuffle_epi8(xmm[0], smm);
for (int i = 7; i >= 0; i--)
{
int tmp = _mm_movemask_epi8(xmm[0]);
x[(tmp >> 8) & 0xf] = i + 8; x[tmp & 0xf] = i;
xmm[0] = _mm_slli_epi16(xmm[0], 1);
}
}
else if (N == 5)
{
static const __m128i mmmask = _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff);
static const __m128i smm = _mm_set_epi8(0xf, 0xb, 0x7, 0x3, 0xe, 0xa, 0x6, 0x2, 0xd, 0x9, 0x5, 0x1, 0xc, 0x8, 0x4, 0x0);
__m128i xmm[2];
xmm[0] = _mm_loadu_si128((__m128i *)bit_slice.data()); // 3_3 3_2 3_1 3_0 2_3 2_2 2_1 2_0 1_3 1_2 1_1 1_0 0_3 0_2 0_1 0_0
xmm[1] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 1); // x_3 x_2 x_1 x_0 x_3 x_2 x_1 x_0 x_3 x_2 x_1 x_0 4_3 4_2 4_1 4_0
xmm[0] = _mm_shuffle_epi8(xmm[0], smm); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
xmm[1] = _mm_shuffle_epi8(xmm[1], smm);
xmm[1] = _mm_and_si128(xmm[1], mmmask); // x_3 x_3 x_3 4_3 x_2 x_2 x_2 4_2 x_1 x_1 x_1 4_1 x_0 x_0 x_0 4_0
for (int i = 7; i >= 0; i--)
{
int tmp0 = _mm_movemask_epi8(xmm[0]);
int tmp1 = _mm_movemask_epi8(xmm[1]);
x[((tmp0 >> (4 * 3)) & 0xf) | ((tmp1 >> (4 * 2)) & 0x10)] = i + 8 * 3;
x[((tmp0 >> (4 * 2)) & 0xf) | ((tmp1 >> (4 * 1)) & 0x10)] = i + 8 * 2;
x[((tmp0 >> (4 * 1)) & 0xf) | ((tmp1 >> (4 * 0)) & 0x10)] = i + 8 * 1;
x[((tmp0 >> (4 * 0)) & 0xf) | ((tmp1 << (4 * 1)) & 0x10)] = i + 8 * 0;
xmm[0] = _mm_slli_epi16(xmm[0], 1);
xmm[1] = _mm_slli_epi16(xmm[1], 1);
}
}
else if (N == 6)
{
static const __m128i smm = _mm_set_epi8(0xf, 0x7, 0xe, 0x6, 0xd, 0x5, 0xc, 0x4, 0xb, 0x3, 0xa, 0x2, 0x9, 0x1, 0x8, 0x0);
__m128i xmm[4];
__m128i tmm[4];
xmm[0] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 0); // 1_7 1_6 1_5 1_4 1_3 1_2 1_1 1_0 0_7 0_6 0_5 0_4 0_3 0_2 0_1 0_0
xmm[1] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 1); // 3_7 3_6 3_5 3_4 3_3 3_2 3_1 3_0 2_7 2_6 2_5 2_4 2_3 2_2 2_1 2_0
xmm[2] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + 2); // 5_7 5_6 5_5 5_4 5_3 5_2 5_1 5_0 4_7 4_6 4_5 4_4 4_3 4_2 4_1 4_0
xmm[0] = _mm_shuffle_epi8(xmm[0], smm); // 1_7 0_7 1_6 0_6 1_5 0_5 1_4 0_4 1_3 0_3 1_2 0_2 1_1 0_1 1_0 0_0
xmm[1] = _mm_shuffle_epi8(xmm[1], smm); // 3_7 2_7 3_6 2_6 3_5 2_5 3_4 2_4 3_3 2_3 3_2 2_2 3_1 2_1 3_0 2_0
xmm[2] = _mm_shuffle_epi8(xmm[2], smm); // 5_7 4_7 5_6 4_6 5_5 4_5 5_4 4_4 5_3 4_3 5_2 4_2 5_1 4_1 5_0 4_0
xmm[3] = _mm_setzero_si128(); // x_7 x_7 x_6 x_6 x_5 x_5 x_4 x_4 x_3 x_3 x_2 x_2 x_1 x_1 x_0 x_0
tmm[0] = _mm_unpacklo_epi16(xmm[0], xmm[1]); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
tmm[1] = _mm_unpackhi_epi16(xmm[0], xmm[1]); // 3_7 2_7 1_7 0_7 3_6 2_6 1_6 0_6 3_5 2_5 1_5 0_5 3_4 2_4 1_4 0_4
tmm[2] = _mm_unpacklo_epi16(xmm[2], xmm[3]); // x_3 x_3 5_3 4_3 x_2 x_2 5_2 4_2 x_1 x_1 5_1 4_1 x_0 x_0 5_0 4_0
tmm[3] = _mm_unpackhi_epi16(xmm[2], xmm[3]); // x_7 x_7 5_7 4_7 x_6 x_6 5_6 4_6 x_5 x_5 5_5 4_5 x_4 x_4 5_4 4_4
xmm[0] = _mm_unpacklo_epi32(tmm[0], tmm[2]); // x_1 x_1 5_1 4_1 3_1 2_1 1_1 0_1 x_0 x_0 5_0 4_0 3_0 2_0 1_0 0_0
xmm[1] = _mm_unpackhi_epi32(tmm[0], tmm[2]); // x_3 x_3 5_3 4_3 3_3 2_3 1_3 0_3 x_2 x_2 5_2 4_2 3_2 2_2 1_2 0_2
xmm[2] = _mm_unpacklo_epi32(tmm[1], tmm[3]); // x_5 x_5 5_5 4_5 3_5 2_5 1_5 0_5 x_4 x_4 5_4 4_4 3_4 2_4 1_4 0_4
xmm[3] = _mm_unpackhi_epi32(tmm[1], tmm[3]); // x_7 x_7 5_7 4_7 3_7 2_7 1_7 0_7 x_6 x_6 5_6 4_6 3_6 2_6 1_6 0_6
for (int i = 7; i >= 0; i--)
{
int tmp0 = _mm_movemask_epi8(xmm[0]);
int tmp1 = _mm_movemask_epi8(xmm[1]);
int tmp2 = _mm_movemask_epi8(xmm[2]);
int tmp3 = _mm_movemask_epi8(xmm[3]);
x[((tmp3 >> (8 * 1)) & 0x3f)] = i + 8 * 7; x[((tmp3 >> (8 * 0)) & 0x3f)] = i + 8 * 6;
x[((tmp2 >> (8 * 1)) & 0x3f)] = i + 8 * 5; x[((tmp2 >> (8 * 0)) & 0x3f)] = i + 8 * 4;
x[((tmp1 >> (8 * 1)) & 0x3f)] = i + 8 * 3; x[((tmp1 >> (8 * 0)) & 0x3f)] = i + 8 * 2;
x[((tmp0 >> (8 * 1)) & 0x3f)] = i + 8 * 1; x[((tmp0 >> (8 * 0)) & 0x3f)] = i + 8 * 0;
xmm[0] = _mm_slli_epi16(xmm[0], 1);
xmm[1] = _mm_slli_epi16(xmm[1], 1);
xmm[2] = _mm_slli_epi16(xmm[2], 1);
xmm[3] = _mm_slli_epi16(xmm[3], 1);
}
}
else if (N == 7)
{
__m128i xmm[8];
__m128i tmm[8];
for (int i = 0; i < 7; i++)
{
xmm[i] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + i);
}
xmm[7] = _mm_setzero_si128();
// 0_f 0_e 0_d 0_c 0_b 0_a 0_9 0_8 0_7 0_6 0_5 0_4 0_3 0_2 0_1 0_0
// 1_f 1_e 1_d 1_c 1_b 1_a 1_9 1_8 1_7 1_6 1_5 1_4 1_3 1_2 1_1 1_0
// 2_f 2_e 2_d 2_c 2_b 2_a 2_9 2_8 2_7 2_6 2_5 2_4 2_3 2_2 2_1 2_0
// 3_f 3_e 3_d 3_c 3_b 3_a 3_9 3_8 3_7 3_6 3_5 3_4 3_3 3_2 3_1 3_0
// 4_f 4_e 4_d 4_c 4_b 4_a 4_9 4_8 4_7 4_6 4_5 4_4 4_3 4_2 4_1 4_0
// 5_f 5_e 5_d 5_c 5_b 5_a 5_9 5_8 5_7 5_6 5_5 5_4 5_3 5_2 5_1 5_0
// 6_f 6_e 6_d 6_c 6_b 6_a 6_9 6_8 6_7 6_6 6_5 6_4 6_3 6_2 6_1 6_0
// x_f x_e x_d x_c x_b x_a x_9 x_8 x_7 x_6 x_5 x_4 x_3 x_2 x_1 x_0
tmm[0] = _mm_unpacklo_epi8(xmm[0], xmm[1]); // 1_7 0_7 1_6 0_6 1_5 0_5 1_4 0_4 1_3 0_3 1_2 0_2 1_1 0_1 1_0 0_0
tmm[1] = _mm_unpackhi_epi8(xmm[0], xmm[1]); // 1_f 0_f 1_e 0_e 1_d 0_d 1_c 0_c 1_b 0_b 1_a 0_a 1_9 0_9 1_8 0_8
tmm[2] = _mm_unpacklo_epi8(xmm[2], xmm[3]); // 3_7 2_7 3_6 2_6 3_5 2_5 3_4 2_4 3_3 2_3 3_2 2_2 3_1 2_1 3_0 2_0
tmm[3] = _mm_unpackhi_epi8(xmm[2], xmm[3]); // 3_f 2_f 3_e 2_e 3_d 2_d 3_c 2_c 3_b 2_b 3_a 2_a 3_9 2_9 3_8 2_8
tmm[4] = _mm_unpacklo_epi8(xmm[4], xmm[5]); // 5_7 4_7 5_6 4_6 5_5 4_5 5_4 4_4 5_3 4_3 5_2 4_2 5_1 4_1 5_0 4_0
tmm[5] = _mm_unpackhi_epi8(xmm[4], xmm[5]); // 5_f 4_f 5_e 4_e 5_d 4_d 5_c 4_c 5_b 4_b 5_a 4_a 5_9 4_9 5_8 4_8
tmm[6] = _mm_unpacklo_epi8(xmm[6], xmm[7]); // x_7 6_7 x_6 6_6 x_5 6_5 x_4 6_4 x_3 6_3 x_2 6_2 x_1 6_1 x_0 6_0
tmm[7] = _mm_unpackhi_epi8(xmm[6], xmm[7]); // x_f 6_f x_e 6_e x_d 6_d x_c 6_c x_b 6_b x_a 6_a x_9 6_9 x_8 6_8
xmm[0] = _mm_unpacklo_epi16(tmm[0], tmm[2]); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
xmm[1] = _mm_unpackhi_epi16(tmm[0], tmm[2]); // 3_7 2_7 1_7 0_7 3_6 2_6 1_6 0_6 3_5 2_5 1_5 0_5 3_4 2_4 1_4 0_4
xmm[2] = _mm_unpacklo_epi16(tmm[1], tmm[3]); // 3_b 2_b 1_b 0_b 3_a 2_a 1_a 0_a 3_9 2_9 1_9 0_9 3_8 2_8 1_8 0_8
xmm[3] = _mm_unpackhi_epi16(tmm[1], tmm[3]); // 3_f 2_f 1_f 0_f 3_e 2_e 1_e 0_e 3_d 2_d 1_d 0_d 3_c 2_c 1_c 0_c
xmm[4] = _mm_unpacklo_epi16(tmm[4], tmm[6]); // x_3 6_3 5_3 4_3 x_2 6_2 5_2 4_2 x_1 6_1 5_1 4_1 x_0 6_0 5_0 4_0
xmm[5] = _mm_unpackhi_epi16(tmm[4], tmm[6]); // x_7 6_7 5_7 4_7 x_6 6_6 5_6 4_6 x_5 6_5 5_5 4_5 x_4 6_4 5_4 4_4
xmm[6] = _mm_unpacklo_epi16(tmm[5], tmm[7]); // x_b 6_b 5_b 4_b x_a 6_a 5_a 4_a x_9 6_9 5_9 4_9 x_8 6_8 5_8 4_8
xmm[7] = _mm_unpackhi_epi16(tmm[5], tmm[7]); // x_f 6_f 5_f 4_f x_e 6_e 5_e 4_e x_d 6_d 5_d 4_d x_c 6_c 5_c 4_c
tmm[0] = _mm_unpacklo_epi32(xmm[0], xmm[4]); // x_1 6_1 5_1 4_1 3_1 2_1 1_1 0_1 x_0 6_0 5_0 4_0 3_0 2_0 1_0 0_0
tmm[1] = _mm_unpackhi_epi32(xmm[0], xmm[4]); // x_3 6_3 5_3 4_3 3_3 2_3 1_3 0_3 x_2 6_2 5_2 4_2 3_2 2_2 1_2 0_2
tmm[2] = _mm_unpacklo_epi32(xmm[1], xmm[5]); // x_5 6_5 5_5 4_5 3_5 2_5 1_5 0_5 x_4 6_4 5_4 4_4 3_4 2_4 1_4 0_4
tmm[3] = _mm_unpackhi_epi32(xmm[1], xmm[5]); // x_7 6_7 5_7 4_7 3_7 2_7 1_7 0_7 x_6 6_6 5_6 4_6 3_6 2_6 1_6 0_6
tmm[4] = _mm_unpacklo_epi32(xmm[2], xmm[6]); // x_9 6_9 5_9 4_9 3_9 2_9 1_9 0_9 x_8 6_8 5_8 4_8 3_8 2_8 1_8 0_8
tmm[5] = _mm_unpackhi_epi32(xmm[2], xmm[6]); // x_b 6_b 5_b 4_b 3_b 2_b 1_b 0_b x_a 6_a 5_a 4_a 3_a 2_a 1_a 0_a
tmm[6] = _mm_unpacklo_epi32(xmm[3], xmm[7]); // x_d 6_d 5_d 4_d 3_d 2_d 1_d 0_d x_c 6_c 5_c 4_c 3_c 2_c 1_c 0_c
tmm[7] = _mm_unpackhi_epi32(xmm[3], xmm[7]); // x_f 6_f 5_f 4_f 3_f 2_f 1_f 0_f x_e 6_e 5_e 4_e 3_e 2_e 1_e 0_e
int tmp[8];
for (int i = 7; i >= 0; i--)
{
tmp[0] = _mm_movemask_epi8(tmm[0]);
tmp[1] = _mm_movemask_epi8(tmm[1]);
tmp[2] = _mm_movemask_epi8(tmm[2]);
tmp[3] = _mm_movemask_epi8(tmm[3]);
tmp[4] = _mm_movemask_epi8(tmm[4]);
tmp[5] = _mm_movemask_epi8(tmm[5]);
tmp[6] = _mm_movemask_epi8(tmm[6]);
tmp[7] = _mm_movemask_epi8(tmm[7]);
x[((tmp[7] >> (8 * 1)) & 0x7f)] = i + 16 * 7 + 8; x[((tmp[7] >> (8 * 0)) & 0x7f)] = i + 16 * 7;
x[((tmp[6] >> (8 * 1)) & 0x7f)] = i + 16 * 6 + 8; x[((tmp[6] >> (8 * 0)) & 0x7f)] = i + 16 * 6;
x[((tmp[5] >> (8 * 1)) & 0x7f)] = i + 16 * 5 + 8; x[((tmp[5] >> (8 * 0)) & 0x7f)] = i + 16 * 5;
x[((tmp[4] >> (8 * 1)) & 0x7f)] = i + 16 * 4 + 8; x[((tmp[4] >> (8 * 0)) & 0x7f)] = i + 16 * 4;
x[((tmp[3] >> (8 * 1)) & 0x7f)] = i + 16 * 3 + 8; x[((tmp[3] >> (8 * 0)) & 0x7f)] = i + 16 * 3;
x[((tmp[2] >> (8 * 1)) & 0x7f)] = i + 16 * 2 + 8; x[((tmp[2] >> (8 * 0)) & 0x7f)] = i + 16 * 2;
x[((tmp[1] >> (8 * 1)) & 0x7f)] = i + 16 * 1 + 8; x[((tmp[1] >> (8 * 0)) & 0x7f)] = i + 16 * 1;
x[((tmp[0] >> (8 * 1)) & 0x7f)] = i + 16 * 0 + 8; x[((tmp[0] >> (8 * 0)) & 0x7f)] = i + 16 * 0;
tmm[0] = _mm_slli_epi16(tmm[0], 1);
tmm[1] = _mm_slli_epi16(tmm[1], 1);
tmm[2] = _mm_slli_epi16(tmm[2], 1);
tmm[3] = _mm_slli_epi16(tmm[3], 1);
tmm[4] = _mm_slli_epi16(tmm[4], 1);
tmm[5] = _mm_slli_epi16(tmm[5], 1);
tmm[6] = _mm_slli_epi16(tmm[6], 1);
tmm[7] = _mm_slli_epi16(tmm[7], 1);
}
}
else if (N == 8)
{
__m128i xmm[8];
__m128i tmm[8];
for (int k = 1; k >= 0; k--)
{
for (int i = 0; i < 8; i++)
{
xmm[i] = _mm_loadu_si128(((__m128i *)bit_slice.data()) + i * 2 + k);
}
tmm[0] = _mm_unpacklo_epi8(xmm[0], xmm[1]); // 1_7 0_7 1_6 0_6 1_5 0_5 1_4 0_4 1_3 0_3 1_2 0_2 1_1 0_1 1_0 0_0
tmm[1] = _mm_unpackhi_epi8(xmm[0], xmm[1]); // 1_f 0_f 1_e 0_e 1_d 0_d 1_c 0_c 1_b 0_b 1_a 0_a 1_9 0_9 1_8 0_8
tmm[2] = _mm_unpacklo_epi8(xmm[2], xmm[3]); // 3_7 2_7 3_6 2_6 3_5 2_5 3_4 2_4 3_3 2_3 3_2 2_2 3_1 2_1 3_0 2_0
tmm[3] = _mm_unpackhi_epi8(xmm[2], xmm[3]); // 3_f 2_f 3_e 2_e 3_d 2_d 3_c 2_c 3_b 2_b 3_a 2_a 3_9 2_9 3_8 2_8
tmm[4] = _mm_unpacklo_epi8(xmm[4], xmm[5]); // 5_7 4_7 5_6 4_6 5_5 4_5 5_4 4_4 5_3 4_3 5_2 4_2 5_1 4_1 5_0 4_0
tmm[5] = _mm_unpackhi_epi8(xmm[4], xmm[5]); // 5_f 4_f 5_e 4_e 5_d 4_d 5_c 4_c 5_b 4_b 5_a 4_a 5_9 4_9 5_8 4_8
tmm[6] = _mm_unpacklo_epi8(xmm[6], xmm[7]); // 7_7 6_7 7_6 6_6 7_5 6_5 7_4 6_4 7_3 6_3 7_2 6_2 7_1 6_1 7_0 6_0
tmm[7] = _mm_unpackhi_epi8(xmm[6], xmm[7]); // 7_f 6_f 7_e 6_e 7_d 6_d 7_c 6_c 7_b 6_b 7_a 6_a 7_9 6_9 7_8 6_8
xmm[0] = _mm_unpacklo_epi16(tmm[0], tmm[2]); // 3_3 2_3 1_3 0_3 3_2 2_2 1_2 0_2 3_1 2_1 1_1 0_1 3_0 2_0 1_0 0_0
xmm[1] = _mm_unpackhi_epi16(tmm[0], tmm[2]); // 3_7 2_7 1_7 0_7 3_6 2_6 1_6 0_6 3_5 2_5 1_5 0_5 3_4 2_4 1_4 0_4
xmm[2] = _mm_unpacklo_epi16(tmm[1], tmm[3]); // 3_b 2_b 1_b 0_b 3_a 2_a 1_a 0_a 3_9 2_9 1_9 0_9 3_8 2_8 1_8 0_8
xmm[3] = _mm_unpackhi_epi16(tmm[1], tmm[3]); // 3_f 2_f 1_f 0_f 3_e 2_e 1_e 0_e 3_d 2_d 1_d 0_d 3_c 2_c 1_c 0_c
xmm[4] = _mm_unpacklo_epi16(tmm[4], tmm[6]); // 7_3 6_3 5_3 4_3 7_2 6_2 5_2 4_2 7_1 6_1 5_1 4_1 7_0 6_0 5_0 4_0
xmm[5] = _mm_unpackhi_epi16(tmm[4], tmm[6]); // 7_7 6_7 5_7 4_7 7_6 6_6 5_6 4_6 7_5 6_5 5_5 4_5 7_4 6_4 5_4 4_4
xmm[6] = _mm_unpacklo_epi16(tmm[5], tmm[7]); // 7_b 6_b 5_b 4_b 7_a 6_a 5_a 4_a 7_9 6_9 5_9 4_9 7_8 6_8 5_8 4_8
xmm[7] = _mm_unpackhi_epi16(tmm[5], tmm[7]); // 7_f 6_f 5_f 4_f 7_e 6_e 5_e 4_e 7_d 6_d 5_d 4_d 7_c 6_c 5_c 4_c
tmm[0] = _mm_unpacklo_epi32(xmm[0], xmm[4]); // 7_1 6_1 5_1 4_1 3_1 2_1 1_1 0_1 7_0 6_0 5_0 4_0 3_0 2_0 1_0 0_0
tmm[1] = _mm_unpackhi_epi32(xmm[0], xmm[4]); // 7_3 6_3 5_3 4_3 3_3 2_3 1_3 0_3 7_2 6_2 5_2 4_2 3_2 2_2 1_2 0_2
tmm[2] = _mm_unpacklo_epi32(xmm[1], xmm[5]); // 7_5 6_5 5_5 4_5 3_5 2_5 1_5 0_5 7_4 6_4 5_4 4_4 3_4 2_4 1_4 0_4
tmm[3] = _mm_unpackhi_epi32(xmm[1], xmm[5]); // 7_7 6_7 5_7 4_7 3_7 2_7 1_7 0_7 7_6 6_6 5_6 4_6 3_6 2_6 1_6 0_6
tmm[4] = _mm_unpacklo_epi32(xmm[2], xmm[6]); // 7_9 6_9 5_9 4_9 3_9 2_9 1_9 0_9 7_8 6_8 5_8 4_8 3_8 2_8 1_8 0_8
tmm[5] = _mm_unpackhi_epi32(xmm[2], xmm[6]); // 7_b 6_b 5_b 4_b 3_b 2_b 1_b 0_b 7_a 6_a 5_a 4_a 3_a 2_a 1_a 0_a
tmm[6] = _mm_unpacklo_epi32(xmm[3], xmm[7]); // 7_d 6_d 5_d 4_d 3_d 2_d 1_d 0_d 7_c 6_c 5_c 4_c 3_c 2_c 1_c 0_c
tmm[7] = _mm_unpackhi_epi32(xmm[3], xmm[7]); // 7_f 6_f 5_f 4_f 3_f 2_f 1_f 0_f 7_e 6_e 5_e 4_e 3_e 2_e 1_e 0_e
int tmp[8];
for (int i = 7; i >= 0; i--)
{
tmp[0] = _mm_movemask_epi8(tmm[0]);
tmp[1] = _mm_movemask_epi8(tmm[1]);
tmp[2] = _mm_movemask_epi8(tmm[2]);
tmp[3] = _mm_movemask_epi8(tmm[3]);
tmp[4] = _mm_movemask_epi8(tmm[4]);
tmp[5] = _mm_movemask_epi8(tmm[5]);
tmp[6] = _mm_movemask_epi8(tmm[6]);
tmp[7] = _mm_movemask_epi8(tmm[7]);
x[((tmp[7] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 7 + 8; x[((tmp[7] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 7;
x[((tmp[6] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 6 + 8; x[((tmp[6] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 6;
x[((tmp[5] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 5 + 8; x[((tmp[5] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 5;
x[((tmp[4] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 4 + 8; x[((tmp[4] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 4;
x[((tmp[3] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 3 + 8; x[((tmp[3] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 3;
x[((tmp[2] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 2 + 8; x[((tmp[2] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 2;
x[((tmp[1] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 1 + 8; x[((tmp[1] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 1;
x[((tmp[0] >> (8 * 1)) & 0xff)] = 128 * k + i + 16 * 0 + 8; x[((tmp[0] >> (8 * 0)) & 0xff)] = 128 * k + i + 16 * 0;
tmm[0] = _mm_slli_epi16(tmm[0], 1);
tmm[1] = _mm_slli_epi16(tmm[1], 1);
tmm[2] = _mm_slli_epi16(tmm[2], 1);
tmm[3] = _mm_slli_epi16(tmm[3], 1);
tmm[4] = _mm_slli_epi16(tmm[4], 1);
tmm[5] = _mm_slli_epi16(tmm[5], 1);
tmm[6] = _mm_slli_epi16(tmm[6], 1);
tmm[7] = _mm_slli_epi16(tmm[7], 1);
}
}
}
else
{
cout << "bit_slice_to_iLUT():: Not support for N > 8." << endl;
}
}
// EQU(w, u): true when the two 128-bit values agree on every bit selected by one_128.
// _mm_testz_si128(a, b) returns 1 exactly when (a AND b) is all zero, so the macro is
// true when (w XOR u) has no bit set under the mask one_128.
// NOTE(review): one_128 is defined elsewhere; presumably it is the all-ones constant,
// which makes this a full 128-bit equality test -- confirm.
#define EQU(w, u) (_mm_testz_si128(one_128, _mm_xor_si128(w, u)) == 1)
// PERM(w): true when every one of the 16 bytes of S4_I occurs somewhere among the
// 16 bytes of w. Mode 0x00 of _mm_cmpestrm is "unsigned bytes, equal any, bit mask":
// result bit i is set when byte i of S4_I matches some byte of w, and the low 16 bits
// of that mask are compared against 0xffff.
// NOTE(review): S4_I is defined elsewhere; presumably it holds the bytes 0..15, which
// makes this a "w is a permutation of 0..15" test -- confirm.
#define PERM(w) (_mm_extract_epi16(_mm_cmpestrm(w, 16, S4_I, 16, 0x00), 0) == 0xffff)
bool is_permutation() const
{
if (N == 4)
{
return PERM(LUT[0]);
}
else
{
const uint8_t * LUTp = (uint8_t *) LUT;
bit_slice_l_t<N> ind = { {0} };
for (int i = 0; i < LUT_UNIT_N; i++)
{
uint8_t y = LUTp[i];
if (OPs<N>.get_bit(ind, y) == 1)
{
return false;
}
OPs<N>.set_bit_inplace(ind, y);
}
return true;
}
}
void inverse(uint8_t iS[LUT_UNIT_N]) const
{
if (is_permutation())
{
uint8_t * S = (uint8_t *) LUT;
for (int x = 0; x < LUT_UNIT_N; x++)
{
iS[S[x]] = x;
}
}
else
{
for (int x = 0; x < LUT_UNIT_N; x++)
{
iS[x] = 0;
}
}
};
// Returns the inverse of this function as a new function_t<N>.
// The result's LUT is filled by the out-parameter overload of inverse(): the true inverse
// when this function is a permutation, an all-zero table otherwise. The bit-sliced form of
// the result is then rebuilt from that LUT.
function_t<N> inverse() const
{
    function_t<N> inv;
    inverse((uint8_t *) inv.LUT);
    inv.LUT_to_bit_slice();
    return inv;
}
bool is_involution() const
{
if (N == 4)
{
__m128i iLUT[LUT_XMM_N];
inverse((uint8_t *)iLUT);
return EQU(LUT[0], iLUT[0]);
}
else
{
uint8_t iS[LUT_UNIT_N];
uint8_t * S = (uint8_t *)LUT;
inverse(iS);
for (int i = 0; i < LUT_UNIT_N; i++)
{
if (S[i] != iS[i]) return false;
}
return true;
}
}
void difference_distribution_matrix(int DDT[LUT_UNIT_N][LUT_UNIT_N], int & Diff, int DDT_spectrum[LUT_UNIT_N+1], int & Diff1, int DDT1_spectrum[LUT_UNIT_N+1]) const
{
memset(DDT[0], 0, sizeof(int) * LUT_UNIT_N * LUT_UNIT_N);
Diff = 0;
memset(DDT_spectrum, 0, sizeof(int) * (LUT_UNIT_N+1));
Diff1 = 0;
memset(DDT1_spectrum, 0, sizeof(int) * (LUT_UNIT_N+1));
if (N == 4)
{
__m128i ton;
__m128i t1;
__m128i t2;
int cnt;
#undef TestOd_1_1
#define TestOd_1_1(id, od) \
{ \
ton = _mm_cmpeq_epi8(x##od, t2); \
cnt = _mm_popcnt_u32(_mm_movemask_epi8(ton)); \
DDT[0x##id][0x##od] = cnt; \
Diff = cnt > Diff ? cnt : Diff; \
DDT_spectrum[cnt]++; \
Diff1 = cnt > Diff1 ? cnt : Diff1; \
DDT1_spectrum[cnt]++; \
}
#undef TestOd
#define TestOd(id, od) \
{ \
ton = _mm_cmpeq_epi8(x##od, t2); \
cnt = _mm_popcnt_u32(_mm_movemask_epi8(ton)); \
DDT[0x##id][0x##od] = cnt; \
Diff = cnt > Diff ? cnt : Diff; \