-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxctr_amd64.s
223 lines (203 loc) · 4.18 KB
/
xctr_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// Code generated by command: go run asm.go -out out/xctr_amd64.s -stubs out/stub_amd64.go -pkg hctr2. DO NOT EDIT.
//go:build gc && !purego
#include "textflag.h"
// func xctrAsm(nr int, xk *uint32, out *byte, in *byte, nblocks int, iv *[16]byte)
// Requires: AES, SSE2
//
// xctrAsm processes nblocks 16-byte blocks. For each block it builds a
// counter block (a 64-bit counter, starting at 1, XORed into the 16 bytes
// read from iv), encrypts that block with the round keys at xk using
// AESENC/AESENCLAST, XORs the result with 16 bytes read from in, and stores
// the 16-byte result to out.
//
// The first (nblocks & 3) blocks are handled one at a time (singleLoop); the
// remaining (nblocks >> 2) groups of four blocks are handled by wideLoop.
//
// Arguments (read from the argument frame; $0-48 = no locals, 48 bytes of args):
//   nr      number of AES rounds; compared against 12 to pick the code path
//   xk      base of the round keys, read in 16-byte units
//   out     destination buffer
//   in      source buffer
//   nblocks number of 16-byte blocks
//           NOTE(review): both loops terminate on an equality compare, so this
//           presumably must be non-negative - confirm against the Go caller.
//   iv      16-byte value XORed into every counter block
//
// Register usage:
//   AX  nr
//   CX  xk
//   DX  out
//   BX  in
//   SI  nblocks, later the end byte offset for wideLoop
//   DI  iv pointer, later reused as the byte offset into the round keys
//   R8  block counter (starts at 1, +1 per block)
//   R9  byte offset into in/out
//   R10 end byte offset for singleLoop
//   X0  the 16 bytes loaded from iv
//   X1-X4 counter blocks / keystream blocks
//   X2 (singleLoop), X5 (wideLoop) current round key, then input scratch
//   X6-X8 additional input blocks in wideLoop
TEXT ·xctrAsm(SB), NOSPLIT, $0-48
MOVQ nr+0(FP), AX
MOVQ xk+8(FP), CX
MOVQ out+16(FP), DX
MOVQ in+24(FP), BX
MOVQ nblocks+32(FP), SI
MOVQ iv+40(FP), DI
// Load every fourth round key starting with the initial
// round key addition.
// NOTE(review): no round keys are loaded at this point; every round key is
// loaded from (CX) inside the loops immediately before it is used. The
// comment above appears to be left over from the generator.
// Initialize per-block constants.
// Counter index.
MOVQ $0x00000001, R8
// Offset into dst, src.
XORQ R9, R9
// Nonce.
// The iv pointer in DI is not needed after this load; DI is reused below.
MOVOU (DI), X0
// R10 = (nblocks & 3) * 16: number of bytes handled by the single-block loop.
// If nblocks is a multiple of four, skip straight to the four-block loop.
MOVQ SI, R10
ANDQ $0x03, R10
JZ initWideLoop
SHLQ $0x04, R10
singleLoop:
// X1 = counter (low 64 bits, upper 64 bits zero) XOR nonce.
MOVQ R8, X1
PXOR X0, X1
// DI = byte offset of the next round key relative to xk.
XORQ DI, DI
// Initial round key addition.
MOVOU (CX)(DI*1), X2
PXOR X2, X1
ADDQ $0x00000010, DI
// Choose between AES-128, AES-192, and AES-256.
// nr == 12 skips two rounds, nr < 12 skips four rounds, anything else falls
// through and runs all fourteen rounds.
CMPQ AX, $0x0000000c
JEQ enc192x1
JLT enc128x1
// Rounds 1 and 2.
MOVOU (CX)(DI*1), X2
AESENC X2, X1
MOVOU 16(CX)(DI*1), X2
AESENC X2, X1
ADDQ $0x00000020, DI
// Rounds 3 and 4.
// (Numbered for the fall-through path; on the nr == 12 path these are the
// first two rounds.)
enc192x1:
MOVOU (CX)(DI*1), X2
AESENC X2, X1
MOVOU 16(CX)(DI*1), X2
AESENC X2, X1
ADDQ $0x00000020, DI
// Rounds 5 through 14.
// The final ten rounds, shared by all three paths: nine AESENC followed by
// one AESENCLAST, using ten consecutive round keys starting at offset DI.
enc128x1:
MOVOU (CX)(DI*1), X2
AESENC X2, X1
MOVOU 16(CX)(DI*1), X2
AESENC X2, X1
MOVOU 32(CX)(DI*1), X2
AESENC X2, X1
MOVOU 48(CX)(DI*1), X2
AESENC X2, X1
MOVOU 64(CX)(DI*1), X2
AESENC X2, X1
MOVOU 80(CX)(DI*1), X2
AESENC X2, X1
MOVOU 96(CX)(DI*1), X2
AESENC X2, X1
MOVOU 112(CX)(DI*1), X2
AESENC X2, X1
MOVOU 128(CX)(DI*1), X2
AESENC X2, X1
MOVOU 144(CX)(DI*1), X2
AESENCLAST X2, X1
// XOR the encrypted counter block with 16 input bytes and store the result.
MOVOU (BX)(R9*1), X2
PXOR X2, X1
MOVOU X1, (DX)(R9*1)
// Advance the byte offset and the counter; repeat until R9 reaches R10.
ADDQ $0x10, R9
ADDQ $0x01, R8
CMPQ R10, R9
JNE singleLoop
initWideLoop:
// SI = nblocks >> 2 (number of four-block groups); nothing left if zero.
// Otherwise SI = groups * 64 + R9, the byte offset at which wideLoop stops.
SHRQ $0x02, SI
JZ done
SHLQ $0x06, SI
ADDQ R9, SI
wideLoop:
// Build four consecutive counter blocks in X1..X4; R8 advances by four.
MOVQ R8, X1
INCQ R8
MOVQ R8, X2
INCQ R8
MOVQ R8, X3
INCQ R8
MOVQ R8, X4
INCQ R8
// XOR the nonce into each counter block.
PXOR X0, X1
PXOR X0, X2
PXOR X0, X3
PXOR X0, X4
// DI = byte offset of the next round key relative to xk.
XORQ DI, DI
// Initial round key addition.
MOVOU (CX)(DI*1), X5
PXOR X5, X1
PXOR X5, X2
PXOR X5, X3
PXOR X5, X4
ADDQ $0x00000010, DI
// Choose between AES-128, AES-192, and AES-256.
// Same dispatch as in singleLoop: nr == 12 skips two rounds, nr < 12 skips
// four rounds, anything else runs all fourteen rounds.
CMPQ AX, $0x0000000c
JEQ enc192x4
JLT enc128x4
// Rounds 1 and 2.
// Each round key is loaded once into X5 and applied to all four blocks.
MOVOU (CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 16(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
ADDQ $0x00000020, DI
// Rounds 3 and 4.
enc192x4:
MOVOU (CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 16(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
ADDQ $0x00000020, DI
// Rounds 5 through 14.
// The final ten rounds, shared by all three paths: nine AESENC rounds
// followed by one AESENCLAST round on each of the four blocks.
enc128x4:
MOVOU (CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 16(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 32(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 48(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 64(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 80(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 96(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 112(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 128(CX)(DI*1), X5
AESENC X5, X1
AESENC X5, X2
AESENC X5, X3
AESENC X5, X4
MOVOU 144(CX)(DI*1), X5
AESENCLAST X5, X1
AESENCLAST X5, X2
AESENCLAST X5, X3
AESENCLAST X5, X4
// Load 64 input bytes, XOR them with the four encrypted counter blocks,
// and store the 64 result bytes.
MOVOU (BX)(R9*1), X5
MOVOU 16(BX)(R9*1), X6
MOVOU 32(BX)(R9*1), X7
MOVOU 48(BX)(R9*1), X8
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
PXOR X8, X4
MOVOU X1, (DX)(R9*1)
MOVOU X2, 16(DX)(R9*1)
MOVOU X3, 32(DX)(R9*1)
MOVOU X4, 48(DX)(R9*1)
// Advance by four blocks; repeat until R9 reaches the end offset in SI.
ADDQ $0x40, R9
CMPQ SI, R9
JNE wideLoop
done:
RET