Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make utf8_length_from_utf32 branchless. #489

Closed
wants to merge 1 commit into from

Commits on Aug 29, 2023

  1. Make utf8_length_from_utf32 branchless.

    This helps GCC autovectorize the implementation
    (see https://compiler-explorer.com/z/61xhGE784),
    so instead of
    ```
    # Scalar, branch-per-element code (the "instead of" listing).
    # In:  rdi = pointer to 32-bit elements, rsi = element count
    #      (first two integer args, consistent with the System V AMD64 ABI).
    # Out: rax = sum over all elements of
    #        1 if c <= 127, 2 if c <= 2047, 3 if c <= 65535, else 4.
    # Running total lives in rdx; rcx holds the end pointer.
    utf8_length_from_utf32(char32_t const*, unsigned long):
            test    rsi, rsi                # count == 0 -> return 0 via .L8
            je      .L8
            lea     rcx, [rdi+rsi*4]        # rcx = one-past-the-end pointer
            xor     edx, edx                # total = 0
            jmp     .L7
    .L11:
            add     rdx, 1                  # c <= 127: total += 1
    .L4:
            # Shared "advance and test for end" tail used by .L11 and .L5.
            add     rdi, 4
            cmp     rcx, rdi
            je      .L1
    .L7:
            # Loop head: classify the current element by unsigned compares.
            mov     eax, DWORD PTR [rdi]
            cmp     eax, 127
            jbe     .L11
            cmp     eax, 2047
            ja      .L5
            # 128..2047: total += 2, advance inline and loop.
            add     rdi, 4
            add     rdx, 2
            cmp     rcx, rdi
            jne     .L7
    .L1:
            mov     rax, rdx                # return total
            ret
    .L5:
            # c > 2047: total += 3 + (c > 65535).
            mov     rsi, rdx                # rsi = old total (count no longer needed)
            xor     edx, edx                # zero before cmp so seta sees cmp's flags
            cmp     eax, 65535
            seta    dl                      # edx = (c > 65535) ? 1 : 0
            lea     rdx, [rdx+3+rsi]
            jmp     .L4
    .L8:
            # Empty input: return 0.
            xor     edx, edx
            mov     rax, rdx
            ret
    ```
    it generates
    ```
    # Autovectorized (SSE2) code for the branchless version (the "it generates" listing).
    # In:  rdi = pointer to 32-bit elements, rsi = element count
    #      (first two integer args, consistent with the System V AMD64 ABI).
    # Out: rax = sum over all elements of
    #        1 + (c > 127) + (c > 2047) + (c > 65535), compares unsigned.
    # Layout: a 4-elements-per-iteration vector loop (.L4), then a scalar
    # tail (.L3) that handles the remaining 0..3 elements; counts of 1..3
    # skip the vector loop entirely via .L8.
    utf8_length_from_utf32(char32_t const*, unsigned long):
            mov     rcx, rsi                # rcx = count (kept for the tail checks)
            test    rsi, rsi
            je      .L7                     # count == 0 -> return 0
            lea     rax, [rsi-1]
            cmp     rax, 2
            jbe     .L8                     # count in 1..3 -> scalar tail only
            mov     rdx, rsi
            pxor    xmm5, xmm5              # xmm5 = accumulator for the high element pair
            mov     rax, rdi                # rax = vector-loop cursor
            movdqa  xmm4, XMMWORD PTR .LC1[rip]     # dword 1s: turn compare masks into 0/1
            shr     rdx, 2
            movdqa  xmm3, xmm5              # xmm3 = accumulator for the low element pair
            movdqa  xmm10, XMMWORD PTR .LC2[rip]    # 0x80000000 bias for unsigned compares
            movdqa  xmm6, XMMWORD PTR .LC5[rip]     # qword 1s: the constant "+1" per element
            sal     rdx, 4                  # rdx = (count / 4) * 16 bytes
            movdqa  xmm7, XMMWORD PTR .LC8[rip]     # biased threshold 65535
            pxor    xmm2, xmm2              # zero, used to widen dwords to qwords
            movdqa  xmm9, XMMWORD PTR .LC6[rip]     # biased threshold 127
            movdqa  xmm8, XMMWORD PTR .LC7[rip]     # biased threshold 2047
            add     rdx, rdi                # rdx = end of the vectorized region
    .L4:
            # Vector loop: process 4 elements per iteration.
            movdqu  xmm0, XMMWORD PTR [rax]
            add     rax, 16
            # pcmpgtd is a signed compare; subtracting 0x80000000 from the data
            # (and using equally biased thresholds) makes it act as unsigned.
            psubd   xmm0, xmm10
            movdqa  xmm1, xmm0
            movdqa  xmm11, xmm0
            pcmpgtd xmm1, xmm9              # mask: c > 127
            pcmpgtd xmm11, xmm8             # mask: c > 2047
            pcmpgtd xmm0, xmm7              # mask: c > 65535
            pand    xmm1, xmm4              # masks -> 0/1 per dword
            pand    xmm11, xmm4
            movdqa  xmm13, xmm1
            movdqa  xmm12, xmm11
            # Widen each 0/1 dword to a qword by interleaving with zero, then
            # sum per element: low pair ends up in xmm1, high pair in xmm0.
            punpckhdq       xmm1, xmm2      # high pair of (c > 127)
            pand    xmm0, xmm4
            punpckhdq       xmm11, xmm2     # high pair of (c > 2047)
            punpckldq       xmm13, xmm2     # low pair of (c > 127)
            paddq   xmm11, xmm1
            movdqa  xmm1, xmm0
            punpckldq       xmm12, xmm2     # low pair of (c > 2047)
            punpckldq       xmm1, xmm2      # low pair of (c > 65535)
            punpckhdq       xmm0, xmm2      # high pair of (c > 65535)
            paddq   xmm12, xmm13
            paddq   xmm1, xmm6              # + 1 per element
            paddq   xmm0, xmm6
            paddq   xmm1, xmm12
            paddq   xmm0, xmm11
            paddq   xmm3, xmm1              # accumulate low pair
            paddq   xmm5, xmm0              # accumulate high pair
            cmp     rax, rdx
            jne     .L4
            # Horizontal reduction: combine both accumulators and both qword lanes.
            paddq   xmm3, xmm5
            movdqa  xmm0, xmm3
            psrldq  xmm0, 8
            paddq   xmm3, xmm0
            movq    rax, xmm3               # rax = total for the vectorized region
            test    cl, 3
            je      .L1                     # count is a multiple of 4 -> done
            mov     rdx, rcx
            and     rdx, -4                 # rdx = index of the first tail element
    .L3:
            # Scalar tail, unrolled for up to 3 elements starting at index rdx.
            # Each element adds 1 + (c > 127) + (c > 2047) + (c > 65535) to rax.
            mov     r8d, DWORD PTR [rdi+rdx*4]
            xor     esi, esi
            lea     r9, [0+rdx*4]           # r9 = byte offset of the first tail element
            cmp     r8d, 127
            seta    sil
            lea     rax, [rax+1+rsi]
            xor     esi, esi
            cmp     r8d, 2047
            seta    sil
            add     rsi, rax
            xor     eax, eax
            cmp     r8d, 65535
            seta    al
            add     rax, rsi
            lea     rsi, [rdx+1]
            cmp     rsi, rcx
            jnb     .L1                     # no second tail element
            mov     r8d, DWORD PTR [rdi+4+r9]
            xor     r10d, r10d
            cmp     r8d, 127
            seta    r10b
            xor     esi, esi
            cmp     r8d, 2047
            seta    sil
            cmp     r8d, 65535
            seta    r8b                     # r8b reused as the flag; zero-extended below
            lea     rsi, [r10+1+rsi]
            add     rdx, 2
            movzx   r8d, r8b
            add     rsi, r8
            add     rax, rsi
            cmp     rdx, rcx
            jnb     .L1                     # no third tail element
            mov     ecx, DWORD PTR [rdi+8+r9]       # count in rcx is dead from here on
            xor     esi, esi
            cmp     ecx, 2047
            seta    sil
            xor     edx, edx
            cmp     ecx, 127
            seta    dl
            cmp     ecx, 65535
            seta    cl
            lea     rdx, [rsi+1+rdx]
            movzx   ecx, cl
            add     rdx, rcx
            add     rax, rdx
            ret
    .L7:
            xor     eax, eax                # empty input: return 0
    .L1:
            ret
    .L8:
            # count in 1..3: start the scalar tail at index 0 with total 0.
            xor     edx, edx
            xor     eax, eax
            jmp     .L3
    # Constant pool used by the vector loop.
    # .LC1: four dword 1s (mask -> 0/1).
    .LC1:
            .long   1
            .long   1
            .long   1
            .long   1
    # .LC2: four dwords of 0x80000000 (bias subtracted from the input).
    .LC2:
            .long   -2147483648
            .long   -2147483648
            .long   -2147483648
            .long   -2147483648
    # .LC5: two qword 1s (the per-element base length of 1).
    .LC5:
            .quad   1
            .quad   1
    # .LC6: 127 - 0x80000000 (biased threshold).
    .LC6:
            .long   -2147483521
            .long   -2147483521
            .long   -2147483521
            .long   -2147483521
    # .LC7: 2047 - 0x80000000 (biased threshold).
    .LC7:
            .long   -2147481601
            .long   -2147481601
            .long   -2147481601
            .long   -2147481601
    # .LC8: 65535 - 0x80000000 (biased threshold).
    .LC8:
            .long   -2147418113
            .long   -2147418113
            .long   -2147418113
            .long   -2147418113
    ```
    ttsugriy committed Aug 29, 2023
    Configuration menu
    Copy the full SHA
    03945fd View commit details
    Browse the repository at this point in the history