-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsha1.c
241 lines (217 loc) · 8.63 KB
/
sha1.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// clang -Wall -O3 -mssse3 -msha sha.c -o sha
#include <stdint.h>
#include <immintrin.h>
#include <memory.h>
#define MBYTES 64
typedef struct {
unsigned char msgbuf[MBYTES];
size_t msgbuf_count;
uint64_t total_count;
// Intermediate hash
__m128i h0123; // h0 : h1 : h2 : h3
__m128i h4; // h4 : 0 : 0 : 0
} sha1_ctx;
#define H0 0x67452301
#define H1 0xefcdab89
#define H2 0x98badcfe
#define H3 0x10325476
#define H4 0xc3d2e1f0
void SHA1Init(sha1_ctx* ctx)
{
ctx->h0123 = _mm_set_epi32(H0, H1, H2, H3);
ctx->h4 = _mm_set_epi32(H4, 0, 0, 0);
ctx->msgbuf_count = 0;
ctx->total_count = 0;
}
void SHA1ProcessMsgBlock(sha1_ctx* ctx, const unsigned char* msg)
{
// Cyclic W array
// We keep the W array content cyclically in 4 variables
// Initially:
// cw0 = w0 : w1 : w2 : w3
// cw1 = w4 : w5 : w6 : w7
// cw2 = w8 : w9 : w10 : w11
// cw3 = w12 : w13 : w14 : w15
const __m128i byteswapindex = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
const __m128i* msgx = (const __m128i*)msg;
__m128i cw0 = _mm_shuffle_epi8(_mm_loadu_si128(msgx), byteswapindex);
__m128i cw1 = _mm_shuffle_epi8(_mm_loadu_si128(msgx + 1), byteswapindex);
__m128i cw2 = _mm_shuffle_epi8(_mm_loadu_si128(msgx + 2), byteswapindex);
__m128i cw3 = _mm_shuffle_epi8(_mm_loadu_si128(msgx + 3), byteswapindex);
// Advance W array cycle
// Inputs:
// CW0 = w[t-16] : w[t-15] : w[t-14] : w[t-13]
// CW1 = w[t-12] : w[t-11] : w[t-10] : w[t-9]
// CW2 = w[t-8] : w[t-7] : w[t-6] : w[t-5]
// CW3 = w[t-4] : w[t-3] : w[t-2] : w[t-1]
// Outputs:
// CW1 = w[t-12] : w[t-11] : w[t-10] : w[t-9]
// CW2 = w[t-8] : w[t-7] : w[t-6] : w[t-5]
// CW3 = w[t-4] : w[t-3] : w[t-2] : w[t-1]
// CW0 = w[t] : w[t+1] : w[t+2] : w[t+3]
#define CYCLE_W(CW0, CW1, CW2, CW3) \
CW0 = _mm_sha1msg1_epu32(CW0, CW1); \
CW0 = _mm_xor_si128(CW0, CW2); \
CW0 = _mm_sha1msg2_epu32(CW0, CW3);
__m128i state1 = ctx->h0123; // state1 = a : b : c : d
__m128i w_next = _mm_add_epi32(cw0, ctx->h4); // w_next = w0+e : w1 : w2 : w3
__m128i state2;
// w0 - w3
state2 = _mm_sha1rnds4_epu32(state1, w_next, 0);// state2 = a' : b' : c' : d'
w_next = _mm_sha1nexte_epu32(state1, cw1); // w_next = w4+e' : w5 : w6 : w7
// w4 - w7
state1 = _mm_sha1rnds4_epu32(state2, w_next, 0);
w_next = _mm_sha1nexte_epu32(state2, cw2);
// w8 - w11
state2 = _mm_sha1rnds4_epu32(state1, w_next, 0);
w_next = _mm_sha1nexte_epu32(state1, cw3);
// w12 - w15
CYCLE_W(cw0, cw1, cw2, cw3); // cw0 = w16 : w17 : w18 : w19
state1 = _mm_sha1rnds4_epu32(state2, w_next, 0);
w_next = _mm_sha1nexte_epu32(state2, cw0);
// w16 - w19
CYCLE_W(cw1, cw2, cw3, cw0); // cw1 = w20 : w21 : w22 : w23
state2 = _mm_sha1rnds4_epu32(state1, w_next, 0);
w_next = _mm_sha1nexte_epu32(state1, cw1);
// w20 - w23
CYCLE_W(cw2, cw3, cw0, cw1); // cw2 = w24 : w25 : w26 : w27
state1 = _mm_sha1rnds4_epu32(state2, w_next, 1);
w_next = _mm_sha1nexte_epu32(state2, cw2);
// w24 - w27
CYCLE_W(cw3, cw0, cw1, cw2); // cw3 = w28 : w29 : w30 : w31
state2 = _mm_sha1rnds4_epu32(state1, w_next, 1);
w_next = _mm_sha1nexte_epu32(state1, cw3);
// w28 - w31
CYCLE_W(cw0, cw1, cw2, cw3); // cw0 = w32 : w33 : w34 : w35
state1 = _mm_sha1rnds4_epu32(state2, w_next, 1);
w_next = _mm_sha1nexte_epu32(state2, cw0);
// w32 - w35
CYCLE_W(cw1, cw2, cw3, cw0); // cw1 = w36 : w37 : w38 : w39
state2 = _mm_sha1rnds4_epu32(state1, w_next, 1);
w_next = _mm_sha1nexte_epu32(state1, cw1);
// w36 - w39
CYCLE_W(cw2, cw3, cw0, cw1); // cw2 = w40 : w41 : w42 : w43
state1 = _mm_sha1rnds4_epu32(state2, w_next, 1);
w_next = _mm_sha1nexte_epu32(state2, cw2);
// w40 - w43
CYCLE_W(cw3, cw0, cw1, cw2); // cw3 = w44 : w45 : w46 : w47
state2 = _mm_sha1rnds4_epu32(state1, w_next, 2);
w_next = _mm_sha1nexte_epu32(state1, cw3);
// w44 - w47
CYCLE_W(cw0, cw1, cw2, cw3); // cw0 = w48 : w49 : w50 : w51
state1 = _mm_sha1rnds4_epu32(state2, w_next, 2);
w_next = _mm_sha1nexte_epu32(state2, cw0);
// w48 - w51
CYCLE_W(cw1, cw2, cw3, cw0); // cw1 = w52 : w53 : w54 : w55
state2 = _mm_sha1rnds4_epu32(state1, w_next, 2);
w_next = _mm_sha1nexte_epu32(state1, cw1);
// w52 - w55
CYCLE_W(cw2, cw3, cw0, cw1); // cw2 = w56 : w57 : w58 : w59
state1 = _mm_sha1rnds4_epu32(state2, w_next, 2);
w_next = _mm_sha1nexte_epu32(state2, cw2);
// w56 - w59
CYCLE_W(cw3, cw0, cw1, cw2); // cw3 = w60 : w61 : w62 : w63
state2 = _mm_sha1rnds4_epu32(state1, w_next, 2);
w_next = _mm_sha1nexte_epu32(state1, cw3);
// w60 - w63
CYCLE_W(cw0, cw1, cw2, cw3); // cw0 = w64 : w65 : w66 : w67
state1 = _mm_sha1rnds4_epu32(state2, w_next, 3);
w_next = _mm_sha1nexte_epu32(state2, cw0);
// w64 - w67
CYCLE_W(cw1, cw2, cw3, cw0); // cw1 = w68 : w69 : w70 : w71
state2 = _mm_sha1rnds4_epu32(state1, w_next, 3);
w_next = _mm_sha1nexte_epu32(state1, cw1);
// w68 - w71
CYCLE_W(cw2, cw3, cw0, cw1); // cw2 = w72 : w73 : w74 : w75
state1 = _mm_sha1rnds4_epu32(state2, w_next, 3);
w_next = _mm_sha1nexte_epu32(state2, cw2);
// w72 - w75
CYCLE_W(cw3, cw0, cw1, cw2); // cw3 = w76 : w77 : w78 : w79
state2 = _mm_sha1rnds4_epu32(state1, w_next, 3);
w_next = _mm_sha1nexte_epu32(state1, cw3);
// w76 - w79
state1 = _mm_sha1rnds4_epu32(state2, w_next, 3); // state1 = final a : b : c : d
ctx->h4 = _mm_sha1nexte_epu32(state2, ctx->h4); // Add final e to h4
ctx->h0123 = _mm_add_epi32(state1, ctx->h0123); // Add final a:b:c:d to h0:h1:h2:h3
}
void SHA1Update(sha1_ctx* ctx, const void* buf, size_t length)
{
const unsigned char* p = (const unsigned char*)buf;
ctx->total_count += length;
// If any bytes are left in the message buffer,
// fullfill the block first
if (ctx->msgbuf_count) {
size_t c = MBYTES - ctx->msgbuf_count;
if (length < c) {
memcpy(ctx->msgbuf + ctx->msgbuf_count, p, length);
ctx->msgbuf_count += length;
return;
}
else {
memcpy(ctx->msgbuf + ctx->msgbuf_count, p, c);
p += c;
length -= c;
SHA1ProcessMsgBlock(ctx, ctx->msgbuf);
ctx->msgbuf_count = 0;
}
}
// When we reach here, we have no data left in the message buffer
while (length >= MBYTES) {
// No need to copy into the internal message block
SHA1ProcessMsgBlock(ctx, p);
p += MBYTES;
length -= MBYTES;
}
// Leave the remaining bytes in the message buffer
if (length) {
memcpy(ctx->msgbuf, p, length);
ctx->msgbuf_count = length;
}
}
void SHA1Final(sha1_ctx* ctx, void* digest)
{
// When we reach here, the block is supposed to be unfullfilled.
// Add the terminating bit
ctx->msgbuf[ctx->msgbuf_count++] = 0x80;
// Need to set total length in the last 8-byte of the block.
// If there is no room for the length, process this block first
if (ctx->msgbuf_count + 8 > MBYTES) {
// Fill zeros and process
memset(ctx->msgbuf + ctx->msgbuf_count, 0, MBYTES - ctx->msgbuf_count);
SHA1ProcessMsgBlock(ctx, ctx->msgbuf);
ctx->msgbuf_count = 0;
}
// Fill zeros before the last 8-byte of the block
memset(ctx->msgbuf + ctx->msgbuf_count, 0, MBYTES - 8 - ctx->msgbuf_count);
// Set the length of the message in big-endian
__m128i tmp = _mm_loadl_epi64((__m128i*)&ctx->total_count);
tmp = _mm_slli_epi64(tmp, 3); // convert # of bytes to # of bits
const __m128i total_count_byteswapindex = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7);
tmp = _mm_shuffle_epi8(tmp, total_count_byteswapindex); // convert to big endian
_mm_storel_epi64((__m128i*)(ctx->msgbuf + MBYTES - 8), tmp);
// Process the last block
SHA1ProcessMsgBlock(ctx, ctx->msgbuf);
// Set the resulting hash value, upside down
const __m128i byteswapindex = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__m128i r0123 = _mm_shuffle_epi8(ctx->h0123, byteswapindex);
__m128i r4 = _mm_shuffle_epi8(ctx->h4, byteswapindex);
uint32_t* digestdw = (uint32_t*)digest;
_mm_storeu_si128((__m128i*)digestdw, r0123);
digestdw[4] = _mm_cvtsi128_si32(r4);
}
#if 0
#include <stdio.h>
int main()
{
sha1_ctx ctx;
SHA1Init(&ctx);
SHA1Update(&ctx, "a", 1);
uint8_t digest[20];
SHA1Final(&ctx, digest);
for (int i = 0; i < 20; i++)
{
printf("%02x", digest[i]);
}
printf("\n");
}
#endif