-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathasm-gpshift8.s
63 lines (57 loc) · 1.83 KB
/
asm-gpshift8.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#define LUT_MOVD(r, x1, x2) \
movzx %r#l, %eax \
movzx %r#h, %ebx \
movd 0x000(%rbp, %rax, 2), x1 \
movd 0x200(%rbp, %rbx, 2), x2
# VC++ intrinsics code used s0 = punpck(s0, movd(lut))
# cacheline splits or MMX might be faster than the extra uop to load first.
# PUNPCKLDQ is one fused uop, even from RAM. ...Quad -> DoubleQuad (64->128) from RAM is 2 fused uops, but others are 1
#define LUT_UNPCK(r, x1, x2) \
movzx %r#l, %eax \
movzx %r#h, %ebx \
punpckldq 0x000(%rbp, %rax, 2), x1 \
punpckldq 0x200(%rbp, %rbx, 2), x2
#define LUT_UNPCK_AVX
movzx %r#l, %eax \
movzx %r#h, %ebx \
vpunpckldq 0x000(%rbp, %rax, 2), x1 \
vpunpcklwd 0x200(%rbp, %rbx, 2), x2
.align 32
loop2xGP:
movq 0(%rsi), %rcx # src[0..3] (4xu16)
movq 8(%rsi), %rdx # src[4..8]
LUT_MOVD (c, %xmm0, %xmm1)
LUT_MOVD (d, %xmm8, %xmm9)
shr $16, %rcx
shr $16, %rdx
LUT_UNPCK (c, %xmm0, %xmm1) # cacheline split 1 in 16 (assuming uniform distribution of src lower few bits in every byte)
LUT_UNPCK (d, %xmm8, %xmm9) # cacheline split 1 in 16
# xmm0 has: dwords: [3]=0, [2]=0, [1]=LUT(src[1LO]) [0]=LUT(src[0LO])
# xmm1 has: dwords: [3]=0, [2]=0, [1]=LUT(src[1HI]) [0]=LUT(src[0HI])
shr $16, %rcx
shr $16, %rdx
LUT_MOVD (c, %xmm2, %xmm3)
LUT_MOVD (d, %xmm8, %xmm9)
.align 32
shr8loop:
### %rdx has 8 bytes data from (%rsi)
# %rsi points to source data
# %rdi points to the location in the output buffer to modify
movq (%rsi), %rdx # TODO: read-ahead next 8 source bytes to avoid potential false dep on store that's a multiple of 4k away
movzx %dl, %eax # 0
shr $8, %rdx
movzx %dl, %eax # 1
shr $8, %rdx
movzx %dl, %eax # 2
shr $8, %rdx
movzx %dl, %eax # 3
shr $8, %rdx
movzx %dl, %eax # 4
shr $8, %rdx
movzx %dl, %eax # 5
shr $8, %rdx
movzx %dl, %eax # 6
shr $8, %rdx
#movzx %dl, %eax # 7
# %rdx is ready for use
ja shr8loop