-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: reedsolomon-x86_64-mmx-orig.s
120 lines (106 loc) · 3.1 KB
/
reedsolomon-x86_64-mmx-orig.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#-----------------------------------------------------------------------------
# void rs_process_x86_64_mmx_orig(void* dst, const void* src, size_t size,
#                                 unsigned* LH)
#
# Reed-Solomon GF(2^16) processing loop (original MMX variant).
# XORs the table-looked-up transform of the source data into the destination
# buffer, 8 bytes (four 16-bit GF words) per iteration.
#
# ABI:  System V AMD64 (GAS / AT&T syntax)
# In:   rdi = dst   destination buffer, updated in place
#       rsi = src   source buffer
#       rdx = size  bytes to process; assumed a multiple of 8 and >= 8
#                   (size < 8 is not handled -- TODO confirm with callers)
#       rcx = LH    combined lookup table: low-byte ("L") entries at
#                   LH+0x0000, high-byte ("H") entries at LH+0x0400;
#                   256 x 4-byte entries each
# Out:  none (dst modified in place)
# Clobbers: rax, rcx, rdx, mm0-mm5, flags.  rbx/rbp are saved and restored;
#       emms is executed before return so the x87/MMX state is clean.
#
# NOTE: .text must be selected BEFORE .align, otherwise the alignment
# padding lands in the previously-current section and the entry point is
# not actually aligned.
#-----------------------------------------------------------------------------
        .text
        .align  16
        .globl  rs_process_x86_64_mmx_orig
rs_process_x86_64_mmx_orig:
        push    %rbp                    # callee-saved; reused as table base
# push %rsi                             # (not needed under SysV; kept from a
# push %rdi                             #  Win64-style variant)
        push    %rbx                    # callee-saved scratch
        mov     %rcx, %rbp              # rbp = combined multiplication table
        mov     %rdx, %rcx              # rcx = number of bytes (multiple of 8)
        mov     (%rsi), %edx            # edx = source bytes 0-3 }  first 8
        movd    4(%rsi), %mm4           # mm4 = source bytes 4-7 }  src bytes
        sub     $8, %rcx                # last 8 bytes handled out of line
        jz      last8                   # exactly 8 bytes: skip main loop
        add     %rcx, %rsi              # rsi -> last 8-byte group of input
        add     %rcx, %rdi              # rdi -> last 8-byte group of output
        neg     %rcx                    # rcx = -count; counts up toward 0
# The loop is software-pipelined.  It is faster than scalar code mainly
# because the wide 64-bit loads/stores for source and destination leave the
# load unit(s) free for the 32-bit loads from the LH lookup table.
# punpckldq with a memory operand loads 32 bits into the high half of the
# destination MMX register (its low half is kept).
#
# Register roles inside the loop:
#   rdi       destination base (function arg)
#   rsi       source base (function arg)
#   rbp       lookup table base
#   eax       scratch index (from %dl)
#   ebx       scratch index (from %dh)
#   rcx       negative byte count, counting upward to 0
#   edx/mm4   source data (mm4 holds 64 bits; edx takes 32 bits at a time
#             from mm4 and is shifted right by 16 bits to expose each
#             GF(2^16) word)
#   mm5       previous destination value (results are XORed into dest)
        .align  32
loop:
        movzx   %dl, %eax               # w0 low byte
        movzx   %dh, %ebx               # w0 high byte
        movd    0x0000(%rbp, %rax, 4), %mm0     # mm0 = L[w0.lo]
        shr     $16, %edx               # expose word 1
        movd    0x0400(%rbp, %rbx, 4), %mm1     # mm1 = H[w0.hi]
        movzx   %dl, %eax               # w1 low byte
        movq    0(%rdi, %rcx, 1), %mm5  # mm5 = current dest qword
        movzx   %dh, %ebx               # w1 high byte
        movd    0x0000(%rbp, %rax, 4), %mm2     # mm2 = L[w1.lo]
        movd    %mm4, %edx              # edx = source bytes 4-7 (words 2,3)
        movq    8(%rsi, %rcx, 1), %mm4  # read-ahead: next 8 source bytes
        movzx   %dl, %eax               # w2 low byte
        movd    0x0400(%rbp, %rbx, 4), %mm3     # mm3 = H[w1.hi]
        movzx   %dh, %ebx               # w2 high byte
        shr     $16, %edx               # expose word 3
        punpckldq 0x0000(%rbp, %rax, 4), %mm0   # mm0 |= L[w2.lo] << 32
        movzx   %dl, %eax               # w3 low byte
        punpckldq 0x0400(%rbp, %rbx, 4), %mm1   # mm1 |= H[w2.hi] << 32
        movzx   %dh, %ebx               # w3 high byte
        punpckldq 0x0000(%rbp, %rax, 4), %mm2   # mm2 |= L[w3.lo] << 32
        pxor    %mm0, %mm1              # combine L^H for even words (0,2)
        punpckldq 0x0400(%rbp, %rbx, 4), %mm3   # mm3 |= H[w3.hi] << 32
        movd    %mm4, %edx              # prepare src bytes 3-0 for next loop
        pxor    %mm5, %mm1              # fold in previous dest value
        pxor    %mm2, %mm3              # combine L^H for odd words (1,3)
        psllq   $16, %mm3               # odd-word contribution sits 16b higher
        psrlq   $32, %mm4               # align src bytes 7-4 for next loop
        pxor    %mm3, %mm1              # final 64-bit result
        movq    %mm1, 0(%rdi, %rcx, 1)  # store back to dest
        add     $8, %rcx
        jnz     loop
#
# Handle the final 8 bytes separately: same body as the loop, but with the
# read-ahead load disabled so we never read past the end of the source
# buffer.  rcx is 0 here, so rdi/rsi indexing hits the last 8-byte group.
#
last8:
        movzx   %dl, %eax               # w0 low byte
        movzx   %dh, %ebx               # w0 high byte
        movd    0x0000(%rbp, %rax, 4), %mm0     # mm0 = L[w0.lo]
        shr     $16, %edx               # expose word 1
        movd    0x0400(%rbp, %rbx, 4), %mm1     # mm1 = H[w0.hi]
        movzx   %dl, %eax               # w1 low byte
        movq    0(%rdi, %rcx, 1), %mm5  # mm5 = current dest qword
        movzx   %dh, %ebx               # w1 high byte
        movd    0x0000(%rbp, %rax, 4), %mm2     # mm2 = L[w1.lo]
        movd    %mm4, %edx              # edx = source bytes 4-7 (words 2,3)
# movq 8(%rsi, %rcx, 1), %mm4           # read-ahead disabled: no next qword
        movzx   %dl, %eax               # w2 low byte
        movd    0x0400(%rbp, %rbx, 4), %mm3     # mm3 = H[w1.hi]
        movzx   %dh, %ebx               # w2 high byte
        shr     $16, %edx               # expose word 3
        punpckldq 0x0000(%rbp, %rax, 4), %mm0   # mm0 |= L[w2.lo] << 32
        movzx   %dl, %eax               # w3 low byte
        punpckldq 0x0400(%rbp, %rbx, 4), %mm1   # mm1 |= H[w2.hi] << 32
        movzx   %dh, %ebx               # w3 high byte
        punpckldq 0x0000(%rbp, %rax, 4), %mm2   # mm2 |= L[w3.lo] << 32
        pxor    %mm0, %mm1              # combine L^H for even words (0,2)
        punpckldq 0x0400(%rbp, %rbx, 4), %mm3   # mm3 |= H[w3.hi] << 32
# movd %mm4, %edx                       # no next iteration to prepare
        pxor    %mm5, %mm1              # fold in previous dest value
        pxor    %mm2, %mm3              # combine L^H for odd words (1,3)
        psllq   $16, %mm3               # odd-word contribution sits 16b higher
# psrlq $32, %mm4                       # no next iteration to align
        pxor    %mm3, %mm1              # final 64-bit result
        movq    %mm1, 0(%rdi, %rcx, 1)  # store last 8 bytes of dest
#
# Done: exit MMX mode, restore callee-saved registers, return.
#
        emms                            # clear MMX/x87 state before returning
        pop     %rbx
# pop %rdi
# pop %rsi
        pop     %rbp
        ret