Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make utf8_length_from_utf32 branchless. #489

Closed
wants to merge 1 commit into from

Conversation

ttsugriy
Copy link
Contributor

@ttsugriy ttsugriy commented Aug 29, 2023

Making the function branchless lets GCC autovectorize it: https://compiler-explorer.com/z/M6Tjd43nz
So instead of

# Branchy scalar version (compiler output, GAS Intel syntax).
# rdi is read as the input pointer and rsi as the element count (4 bytes per
# element); the running total lives in rdx and is returned in rax.
# Each element adds 1 if <= 127, 2 if <= 2047, 3 if <= 65535, otherwise 4.
utf8_length_from_utf32(char32_t const*, unsigned long):
        test    rsi, rsi
        je      .L8                     # count == 0: return 0
        lea     rcx, [rdi+rsi*4]        # rcx = one-past-the-end pointer
        xor     edx, edx                # total = 0
        jmp     .L7
.L11:
        add     rdx, 1                  # element <= 127: total += 1
.L4:
        # Shared "advance and test for end" tail used by the 1-, 3- and 4-byte cases.
        add     rdi, 4
        cmp     rcx, rdi
        je      .L1
.L7:
        # Loop head: classify the current element with unsigned compares.
        mov     eax, DWORD PTR [rdi]
        cmp     eax, 127
        jbe     .L11
        cmp     eax, 2047
        ja      .L5
        # 128..2047: total += 2, with its own copy of the advance/end test.
        add     rdi, 4
        add     rdx, 2
        cmp     rcx, rdi
        jne     .L7
.L1:
        mov     rax, rdx                # return total
        ret
.L5:
        # element > 2047: total += 3 + (element > 65535)
        mov     rsi, rdx                # rsi is free here; park the old total in it
        xor     edx, edx
        cmp     eax, 65535
        seta    dl
        lea     rdx, [rdx+3+rsi]
        jmp     .L4
.L8:
        # Empty input.
        xor     edx, edx
        mov     rax, rdx
        ret

it generates

# Autovectorized version (compiler output, GAS Intel syntax, SSE2 instructions).
# rdi is read as the input pointer and rsi as the element count (4 bytes per
# element); the result is returned in rax.
# Per element the contribution is 1 + (x > 127) + (x > 2047) + (x > 65535).
# Layout: a main loop that consumes four elements per iteration, followed by
# an unrolled scalar tail for the remaining 1..3 elements.
utf8_length_from_utf32(char32_t const*, unsigned long):
        mov     rcx, rsi                # rcx keeps the element count for the tail
        test    rsi, rsi
        je      .L7                     # count == 0: return 0
        lea     rax, [rsi-1]
        cmp     rax, 2
        jbe     .L8                     # count in 1..3: skip the vector loop entirely
        mov     rdx, rsi
        pxor    xmm5, xmm5              # xmm5 = accumulator for elements 2,3 of each group
        mov     rax, rdi                # rax = read cursor
        movdqa  xmm4, XMMWORD PTR .LC1[rip]     # dword 1s: turns compare masks into 0/1
        shr     rdx, 2
        movdqa  xmm3, xmm5              # xmm3 = accumulator for elements 0,1 of each group
        movdqa  xmm10, XMMWORD PTR .LC2[rip]    # 0x80000000 bias per lane
        movdqa  xmm6, XMMWORD PTR .LC5[rip]     # qword 1s: the constant "+1" per element
        sal     rdx, 4                  # rdx = (count / 4) * 16 bytes
        movdqa  xmm7, XMMWORD PTR .LC8[rip]     # biased threshold 65535
        pxor    xmm2, xmm2              # zero register used to widen dwords to qwords
        movdqa  xmm9, XMMWORD PTR .LC6[rip]     # biased threshold 127
        movdqa  xmm8, XMMWORD PTR .LC7[rip]     # biased threshold 2047
        add     rdx, rdi                # rdx = end of the last full group of four
.L4:
        movdqu  xmm0, XMMWORD PTR [rax]         # unaligned load of four elements
        add     rax, 16
        # pcmpgtd compares signed dwords; subtracting 0x80000000 from both the
        # data and the thresholds makes the comparison behave as unsigned.
        psubd   xmm0, xmm10
        movdqa  xmm1, xmm0
        movdqa  xmm11, xmm0
        pcmpgtd xmm1, xmm9              # xmm1  = mask(x > 127)
        pcmpgtd xmm11, xmm8             # xmm11 = mask(x > 2047)
        pcmpgtd xmm0, xmm7              # xmm0  = mask(x > 65535)
        pand    xmm1, xmm4              # masks -> 0/1 per lane
        pand    xmm11, xmm4
        movdqa  xmm13, xmm1
        movdqa  xmm12, xmm11
        # Interleaving with zero widens each 0/1 dword to a qword so the sums
        # are accumulated in 64-bit lanes. "h" = upper two lanes, "l" = lower two.
        punpckhdq       xmm1, xmm2      # (x > 127)   upper lanes
        pand    xmm0, xmm4
        punpckhdq       xmm11, xmm2     # (x > 2047)  upper lanes
        punpckldq       xmm13, xmm2     # (x > 127)   lower lanes
        paddq   xmm11, xmm1             # upper: (x > 2047) + (x > 127)
        movdqa  xmm1, xmm0
        punpckldq       xmm12, xmm2     # (x > 2047)  lower lanes
        punpckldq       xmm1, xmm2      # (x > 65535) lower lanes
        punpckhdq       xmm0, xmm2      # (x > 65535) upper lanes
        paddq   xmm12, xmm13            # lower: (x > 2047) + (x > 127)
        paddq   xmm1, xmm6              # lower: (x > 65535) + 1
        paddq   xmm0, xmm6              # upper: (x > 65535) + 1
        paddq   xmm1, xmm12             # lower: full per-element contribution
        paddq   xmm0, xmm11             # upper: full per-element contribution
        paddq   xmm3, xmm1
        paddq   xmm5, xmm0
        cmp     rax, rdx
        jne     .L4
        # Horizontal reduction: merge both accumulators, then add the two qwords.
        paddq   xmm3, xmm5
        movdqa  xmm0, xmm3
        psrldq  xmm0, 8
        paddq   xmm3, xmm0
        movq    rax, xmm3               # rax = total for all full groups
        test    cl, 3
        je      .L1                     # count is a multiple of 4: done
        mov     rdx, rcx
        and     rdx, -4                 # rdx = index of the first leftover element
.L3:
        # Scalar tail, element 1 of up to 3. Entered with rax = total so far and
        # rdx = index of the first unprocessed element.
        mov     r8d, DWORD PTR [rdi+rdx*4]
        xor     esi, esi
        lea     r9, [0+rdx*4]           # r9 = byte offset of the first tail element
        cmp     r8d, 127
        seta    sil
        lea     rax, [rax+1+rsi]        # total += 1 + (x > 127)
        xor     esi, esi
        cmp     r8d, 2047
        seta    sil
        add     rsi, rax
        xor     eax, eax
        cmp     r8d, 65535
        seta    al
        add     rax, rsi                # total += (x > 2047) + (x > 65535)
        lea     rsi, [rdx+1]
        cmp     rsi, rcx
        jnb     .L1                     # no second tail element
        # Tail element 2.
        mov     r8d, DWORD PTR [rdi+4+r9]
        xor     r10d, r10d
        cmp     r8d, 127
        seta    r10b
        xor     esi, esi
        cmp     r8d, 2047
        seta    sil
        cmp     r8d, 65535
        seta    r8b
        lea     rsi, [r10+1+rsi]
        add     rdx, 2
        movzx   r8d, r8b
        add     rsi, r8
        add     rax, rsi
        cmp     rdx, rcx
        jnb     .L1                     # no third tail element
        # Tail element 3 (last possible one, so this path returns directly).
        mov     ecx, DWORD PTR [rdi+8+r9]
        xor     esi, esi
        cmp     ecx, 2047
        seta    sil
        xor     edx, edx
        cmp     ecx, 127
        seta    dl
        cmp     ecx, 65535
        seta    cl
        lea     rdx, [rsi+1+rdx]
        movzx   ecx, cl
        add     rdx, rcx
        add     rax, rdx
        ret
.L7:
        xor     eax, eax                # empty input: return 0
.L1:
        ret
.L8:
        # Fewer than four elements: start the scalar tail at index 0 with total 0.
        xor     edx, edx
        xor     eax, eax
        jmp     .L3
# Constant pool loaded by the vectorized loop above.
# .LC1: 1 in every dword lane; ANDed with compare masks to produce 0/1.
.LC1:
        .long   1
        .long   1
        .long   1
        .long   1
# .LC2: -2147483648 (0x80000000) in every dword lane; subtracted from the input
# before the signed compares.
.LC2:
        .long   -2147483648
        .long   -2147483648
        .long   -2147483648
        .long   -2147483648
# .LC5: 1 in each qword lane; added once per element to the widened sums.
.LC5:
        .quad   1
        .quad   1
# .LC6: 127 - 2147483648, i.e. the threshold 127 with the .LC2 bias applied.
.LC6:
        .long   -2147483521
        .long   -2147483521
        .long   -2147483521
        .long   -2147483521
# .LC7: 2047 - 2147483648, i.e. the threshold 2047 with the .LC2 bias applied.
.LC7:
        .long   -2147481601
        .long   -2147481601
        .long   -2147481601
        .long   -2147481601
# .LC8: 65535 - 2147483648, i.e. the threshold 65535 with the .LC2 bias applied.
.LC8:
        .long   -2147418113
        .long   -2147418113
        .long   -2147418113
        .long   -2147418113

Making the function branchless lets GCC autovectorize it:
https://compiler-explorer.com/z/61xhGE784
So instead of
```
# Branchy scalar version (compiler output, GAS Intel syntax).
# rdi is read as the input pointer and rsi as the element count (4 bytes per
# element); the running total lives in rdx and is returned in rax.
# Each element adds 1 if <= 127, 2 if <= 2047, 3 if <= 65535, otherwise 4.
utf8_length_from_utf32(char32_t const*, unsigned long):
        test    rsi, rsi
        je      .L8                     # count == 0: return 0
        lea     rcx, [rdi+rsi*4]        # rcx = one-past-the-end pointer
        xor     edx, edx                # total = 0
        jmp     .L7
.L11:
        add     rdx, 1                  # element <= 127: total += 1
.L4:
        # Shared "advance and test for end" tail used by the 1-, 3- and 4-byte cases.
        add     rdi, 4
        cmp     rcx, rdi
        je      .L1
.L7:
        # Loop head: classify the current element with unsigned compares.
        mov     eax, DWORD PTR [rdi]
        cmp     eax, 127
        jbe     .L11
        cmp     eax, 2047
        ja      .L5
        # 128..2047: total += 2, with its own copy of the advance/end test.
        add     rdi, 4
        add     rdx, 2
        cmp     rcx, rdi
        jne     .L7
.L1:
        mov     rax, rdx                # return total
        ret
.L5:
        # element > 2047: total += 3 + (element > 65535)
        mov     rsi, rdx                # rsi is free here; park the old total in it
        xor     edx, edx
        cmp     eax, 65535
        seta    dl
        lea     rdx, [rdx+3+rsi]
        jmp     .L4
.L8:
        # Empty input.
        xor     edx, edx
        mov     rax, rdx
        ret
```
it generates
```
# Autovectorized version (compiler output, GAS Intel syntax, SSE2 instructions).
# rdi is read as the input pointer and rsi as the element count (4 bytes per
# element); the result is returned in rax.
# Per element the contribution is 1 + (x > 127) + (x > 2047) + (x > 65535).
# Layout: a main loop that consumes four elements per iteration, followed by
# an unrolled scalar tail for the remaining 1..3 elements.
utf8_length_from_utf32(char32_t const*, unsigned long):
        mov     rcx, rsi                # rcx keeps the element count for the tail
        test    rsi, rsi
        je      .L7                     # count == 0: return 0
        lea     rax, [rsi-1]
        cmp     rax, 2
        jbe     .L8                     # count in 1..3: skip the vector loop entirely
        mov     rdx, rsi
        pxor    xmm5, xmm5              # xmm5 = accumulator for elements 2,3 of each group
        mov     rax, rdi                # rax = read cursor
        movdqa  xmm4, XMMWORD PTR .LC1[rip]     # dword 1s: turns compare masks into 0/1
        shr     rdx, 2
        movdqa  xmm3, xmm5              # xmm3 = accumulator for elements 0,1 of each group
        movdqa  xmm10, XMMWORD PTR .LC2[rip]    # 0x80000000 bias per lane
        movdqa  xmm6, XMMWORD PTR .LC5[rip]     # qword 1s: the constant "+1" per element
        sal     rdx, 4                  # rdx = (count / 4) * 16 bytes
        movdqa  xmm7, XMMWORD PTR .LC8[rip]     # biased threshold 65535
        pxor    xmm2, xmm2              # zero register used to widen dwords to qwords
        movdqa  xmm9, XMMWORD PTR .LC6[rip]     # biased threshold 127
        movdqa  xmm8, XMMWORD PTR .LC7[rip]     # biased threshold 2047
        add     rdx, rdi                # rdx = end of the last full group of four
.L4:
        movdqu  xmm0, XMMWORD PTR [rax]         # unaligned load of four elements
        add     rax, 16
        # pcmpgtd compares signed dwords; subtracting 0x80000000 from both the
        # data and the thresholds makes the comparison behave as unsigned.
        psubd   xmm0, xmm10
        movdqa  xmm1, xmm0
        movdqa  xmm11, xmm0
        pcmpgtd xmm1, xmm9              # xmm1  = mask(x > 127)
        pcmpgtd xmm11, xmm8             # xmm11 = mask(x > 2047)
        pcmpgtd xmm0, xmm7              # xmm0  = mask(x > 65535)
        pand    xmm1, xmm4              # masks -> 0/1 per lane
        pand    xmm11, xmm4
        movdqa  xmm13, xmm1
        movdqa  xmm12, xmm11
        # Interleaving with zero widens each 0/1 dword to a qword so the sums
        # are accumulated in 64-bit lanes. "h" = upper two lanes, "l" = lower two.
        punpckhdq       xmm1, xmm2      # (x > 127)   upper lanes
        pand    xmm0, xmm4
        punpckhdq       xmm11, xmm2     # (x > 2047)  upper lanes
        punpckldq       xmm13, xmm2     # (x > 127)   lower lanes
        paddq   xmm11, xmm1             # upper: (x > 2047) + (x > 127)
        movdqa  xmm1, xmm0
        punpckldq       xmm12, xmm2     # (x > 2047)  lower lanes
        punpckldq       xmm1, xmm2      # (x > 65535) lower lanes
        punpckhdq       xmm0, xmm2      # (x > 65535) upper lanes
        paddq   xmm12, xmm13            # lower: (x > 2047) + (x > 127)
        paddq   xmm1, xmm6              # lower: (x > 65535) + 1
        paddq   xmm0, xmm6              # upper: (x > 65535) + 1
        paddq   xmm1, xmm12             # lower: full per-element contribution
        paddq   xmm0, xmm11             # upper: full per-element contribution
        paddq   xmm3, xmm1
        paddq   xmm5, xmm0
        cmp     rax, rdx
        jne     .L4
        # Horizontal reduction: merge both accumulators, then add the two qwords.
        paddq   xmm3, xmm5
        movdqa  xmm0, xmm3
        psrldq  xmm0, 8
        paddq   xmm3, xmm0
        movq    rax, xmm3               # rax = total for all full groups
        test    cl, 3
        je      .L1                     # count is a multiple of 4: done
        mov     rdx, rcx
        and     rdx, -4                 # rdx = index of the first leftover element
.L3:
        # Scalar tail, element 1 of up to 3. Entered with rax = total so far and
        # rdx = index of the first unprocessed element.
        mov     r8d, DWORD PTR [rdi+rdx*4]
        xor     esi, esi
        lea     r9, [0+rdx*4]           # r9 = byte offset of the first tail element
        cmp     r8d, 127
        seta    sil
        lea     rax, [rax+1+rsi]        # total += 1 + (x > 127)
        xor     esi, esi
        cmp     r8d, 2047
        seta    sil
        add     rsi, rax
        xor     eax, eax
        cmp     r8d, 65535
        seta    al
        add     rax, rsi                # total += (x > 2047) + (x > 65535)
        lea     rsi, [rdx+1]
        cmp     rsi, rcx
        jnb     .L1                     # no second tail element
        # Tail element 2.
        mov     r8d, DWORD PTR [rdi+4+r9]
        xor     r10d, r10d
        cmp     r8d, 127
        seta    r10b
        xor     esi, esi
        cmp     r8d, 2047
        seta    sil
        cmp     r8d, 65535
        seta    r8b
        lea     rsi, [r10+1+rsi]
        add     rdx, 2
        movzx   r8d, r8b
        add     rsi, r8
        add     rax, rsi
        cmp     rdx, rcx
        jnb     .L1                     # no third tail element
        # Tail element 3 (last possible one, so this path returns directly).
        mov     ecx, DWORD PTR [rdi+8+r9]
        xor     esi, esi
        cmp     ecx, 2047
        seta    sil
        xor     edx, edx
        cmp     ecx, 127
        seta    dl
        cmp     ecx, 65535
        seta    cl
        lea     rdx, [rsi+1+rdx]
        movzx   ecx, cl
        add     rdx, rcx
        add     rax, rdx
        ret
.L7:
        xor     eax, eax                # empty input: return 0
.L1:
        ret
.L8:
        # Fewer than four elements: start the scalar tail at index 0 with total 0.
        xor     edx, edx
        xor     eax, eax
        jmp     .L3
# Constant pool loaded by the vectorized loop above.
# .LC1: 1 in every dword lane; ANDed with compare masks to produce 0/1.
.LC1:
        .long   1
        .long   1
        .long   1
        .long   1
# .LC2: -2147483648 (0x80000000) in every dword lane; subtracted from the input
# before the signed compares.
.LC2:
        .long   -2147483648
        .long   -2147483648
        .long   -2147483648
        .long   -2147483648
# .LC5: 1 in each qword lane; added once per element to the widened sums.
.LC5:
        .quad   1
        .quad   1
# .LC6: 127 - 2147483648, i.e. the threshold 127 with the .LC2 bias applied.
.LC6:
        .long   -2147483521
        .long   -2147483521
        .long   -2147483521
        .long   -2147483521
# .LC7: 2047 - 2147483648, i.e. the threshold 2047 with the .LC2 bias applied.
.LC7:
        .long   -2147481601
        .long   -2147481601
        .long   -2147481601
        .long   -2147481601
# .LC8: 65535 - 2147483648, i.e. the threshold 65535 with the .LC2 bias applied.
.LC8:
        .long   -2147418113
        .long   -2147418113
        .long   -2147418113
        .long   -2147418113
```
@anonrig
Copy link
Member

anonrig commented Aug 29, 2023

The ada_idna.cpp file is originally implemented in the idna repository. Can you update it there directly?

@ttsugriy
Copy link
Contributor Author

Oh, sorry, I didn't realize it's from a different repo. I'll send a PR there.

@ttsugriy ttsugriy closed this Aug 29, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants