This change helps GCC auto-vectorize the implementation.
See https://compiler-explorer.com/z/61xhGE784
So, instead of the scalar loop
```
# Scalar loop: sums a per-element byte count over an array of 32-bit values.
# As used below: rdi = input pointer, rsi = element count, result in rax.
# Each element adds 1 (value <= 127), 2 (<= 2047), 3 (<= 65535) or 4 (larger)
# to the running total in rdx; all compares are unsigned (jbe / ja / seta).
utf8_length_from_utf32(char32_t const*, unsigned long):
# Empty input: take the return-0 path at .L8.
test rsi, rsi
je .L8
# rcx = one-past-the-end pointer (rdi + 4 * count); rdx = running total = 0.
lea rcx, [rdi+rsi*4]
xor edx, edx
jmp .L7
# Value <= 127: counts as 1.
.L11:
add rdx, 1
# Shared loop tail: step to the next element and stop at the end pointer.
.L4:
add rdi, 4
cmp rcx, rdi
je .L1
# Loop head: load the current element and classify it.
.L7:
mov eax, DWORD PTR [rdi]
cmp eax, 127
jbe .L11
cmp eax, 2047
ja .L5
# 128..2047: counts as 2; this path carries its own copy of the advance/exit test.
add rdi, 4
add rdx, 2
cmp rcx, rdi
jne .L7
# Done: return the total.
.L1:
mov rax, rdx
ret
# Value > 2047: add 3, plus 1 more when the value is above 65535.
# The old total is parked in rsi so rdx can be zeroed for seta.
.L5:
mov rsi, rdx
xor edx, edx
cmp eax, 65535
seta dl
lea rdx, [rdx+3+rsi]
jmp .L4
# count == 0: return 0.
.L8:
xor edx, edx
mov rax, rdx
ret
```
GCC now generates vectorized code:
```
# Vectorized variant. As used below: rdi = input pointer, rsi = element count,
# result in rax. Every element contributes
#     1 + (v > 127) + (v > 2047) + (v > 65535)      (unsigned compares)
# The main loop (.L4) handles 4 elements per iteration in 128-bit XMM registers;
# the 1..3 elements left over are handled by the unrolled scalar code at .L3.
utf8_length_from_utf32(char32_t const*, unsigned long):
# Keep the count in rcx; count == 0 returns 0 via .L7.
mov rcx, rsi
test rsi, rsi
je .L7
# count - 1 <= 2 (i.e. count is 1..3): skip the vector loop entirely.
lea rax, [rsi-1]
cmp rax, 2
jbe .L8
# Vector loop setup (scalar and vector instructions are interleaved):
#   rax   = read cursor, starting at rdi
#   rdx   = rdi + (count / 4) * 16 = end of the part processed 4 at a time
#   xmm3, xmm5 = two 2x64-bit accumulators, zeroed
#   xmm2  = zero, used to widen 32-bit lanes to 64-bit
#   xmm4  = .LC1 (1 in each dword), xmm6 = .LC5 (1 in each qword)
#   xmm10 = .LC2 (sign-bit bias), xmm9 / xmm8 / xmm7 = .LC6 / .LC7 / .LC8 thresholds
mov rdx, rsi
pxor xmm5, xmm5
mov rax, rdi
movdqa xmm4, XMMWORD PTR .LC1[rip]
shr rdx, 2
movdqa xmm3, xmm5
movdqa xmm10, XMMWORD PTR .LC2[rip]
movdqa xmm6, XMMWORD PTR .LC5[rip]
sal rdx, 4
movdqa xmm7, XMMWORD PTR .LC8[rip]
pxor xmm2, xmm2
movdqa xmm9, XMMWORD PTR .LC6[rip]
movdqa xmm8, XMMWORD PTR .LC7[rip]
add rdx, rdi
# Main loop: one unaligned 16-byte load = 4 elements per iteration.
.L4:
movdqu xmm0, XMMWORD PTR [rax]
add rax, 16
# Subtract 0x80000000 from every lane so that the signed pcmpgtd below
# orders the values the way an unsigned compare would.
psubd xmm0, xmm10
movdqa xmm1, xmm0
movdqa xmm11, xmm0
# Lane masks: xmm1 = (v > 127), xmm11 = (v > 2047), xmm0 = (v > 65535).
pcmpgtd xmm1, xmm9
pcmpgtd xmm11, xmm8
pcmpgtd xmm0, xmm7
# pand with xmm4 turns each all-ones mask into the integer 1.
pand xmm1, xmm4
pand xmm11, xmm4
movdqa xmm13, xmm1
movdqa xmm12, xmm11
# Interleaving with zero (xmm2) widens the 0/1 dwords to 64-bit lanes:
# punpckldq yields elements 0-1, punpckhdq yields elements 2-3.
punpckhdq xmm1, xmm2
pand xmm0, xmm4
punpckhdq xmm11, xmm2
punpckldq xmm13, xmm2
# Sum the three flags plus the constant 1 (xmm6) separately for the
# low pair (xmm12 -> xmm1) and the high pair (xmm11 -> xmm0).
paddq xmm11, xmm1
movdqa xmm1, xmm0
punpckldq xmm12, xmm2
punpckldq xmm1, xmm2
punpckhdq xmm0, xmm2
paddq xmm12, xmm13
paddq xmm1, xmm6
paddq xmm0, xmm6
paddq xmm1, xmm12
paddq xmm0, xmm11
# Accumulate: xmm3 += low-pair totals, xmm5 += high-pair totals.
paddq xmm3, xmm1
paddq xmm5, xmm0
cmp rax, rdx
jne .L4
# Horizontal reduction: merge both accumulators, then add the upper
# 64-bit lane onto the lower one and move the sum into rax.
paddq xmm3, xmm5
movdqa xmm0, xmm3
psrldq xmm0, 8
paddq xmm3, xmm0
movq rax, xmm3
# count is a multiple of 4: nothing left over, return.
test cl, 3
je .L1
# rdx = index of the first leftover element (count rounded down to a multiple of 4).
mov rdx, rcx
and rdx, -4
# Scalar tail, unrolled for up to 3 elements. On entry rax = total so far and
# rdx = index of the next element; r9 keeps that index as a byte offset.
.L3:
# Leftover element 1: rax += 1 + (v > 127) + (v > 2047) + (v > 65535).
mov r8d, DWORD PTR [rdi+rdx*4]
xor esi, esi
lea r9, [0+rdx*4]
cmp r8d, 127
seta sil
lea rax, [rax+1+rsi]
xor esi, esi
cmp r8d, 2047
seta sil
add rsi, rax
xor eax, eax
cmp r8d, 65535
seta al
add rax, rsi
# Stop if index + 1 has reached the count.
lea rsi, [rdx+1]
cmp rsi, rcx
jnb .L1
# Leftover element 2: same computation; movzx clears the upper bits of r8
# after seta wrote only its low byte.
mov r8d, DWORD PTR [rdi+4+r9]
xor r10d, r10d
cmp r8d, 127
seta r10b
xor esi, esi
cmp r8d, 2047
seta sil
cmp r8d, 65535
seta r8b
lea rsi, [r10+1+rsi]
add rdx, 2
movzx r8d, r8b
add rsi, r8
add rax, rsi
# Stop if index + 2 has reached the count.
cmp rdx, rcx
jnb .L1
# Leftover element 3: the load reuses ecx, so the count is overwritten here.
# No further bounds check follows: at most 3 elements can be left over.
mov ecx, DWORD PTR [rdi+8+r9]
xor esi, esi
cmp ecx, 2047
seta sil
xor edx, edx
cmp ecx, 127
seta dl
cmp ecx, 65535
seta cl
lea rdx, [rsi+1+rdx]
movzx ecx, cl
add rdx, rcx
add rax, rdx
ret
# count == 0: return 0.
.L7:
xor eax, eax
# Common return; rax already holds the result.
.L1:
ret
# count is 1..3: start the scalar tail at index 0 with a zero total.
.L8:
xor edx, edx
xor eax, eax
jmp .L3
# Constant pool for the vector loop; each constant is 16 bytes and is loaded
# with movdqa into an XMM register.
# .LC1: the integer 1 in each 32-bit lane (converts compare masks to 0/1).
.LC1:
.long 1
.long 1
.long 1
.long 1
# .LC2: -2^31 (0x80000000) in each lane; subtracted from the input so that
# signed compares order the values as unsigned.
.LC2:
.long -2147483648
.long -2147483648
.long -2147483648
.long -2147483648
# .LC5: the integer 1 in each 64-bit lane (the base count of 1 per element).
.LC5:
.quad 1
.quad 1
# .LC6: 127 - 2^31 in each lane, i.e. the threshold 127 after the .LC2 bias.
.LC6:
.long -2147483521
.long -2147483521
.long -2147483521
.long -2147483521
# .LC7: 2047 - 2^31 in each lane, i.e. the threshold 2047 after the .LC2 bias.
.LC7:
.long -2147481601
.long -2147481601
.long -2147481601
.long -2147481601
# .LC8: 65535 - 2^31 in each lane, i.e. the threshold 65535 after the .LC2 bias.
.LC8:
.long -2147418113
.long -2147418113
.long -2147418113
.long -2147418113
```