Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utf8 valid arm64 neon #87

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions utf8/test.out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�􏿿􏿿􏿿􏿿􏿿􏿿􏿿􏿿
2 changes: 1 addition & 1 deletion utf8/valid.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
type Validation byte

const (
Invalid = 0
Invalid = 0b00
UTF8 = 0b01
ASCII = 0b10 | UTF8
)
Expand Down
2 changes: 1 addition & 1 deletion utf8/valid_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions utf8/valid_arm64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

228 changes: 228 additions & 0 deletions utf8/valid_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
// TODO: license

//go:build !purego

#include "textflag.h"

// func validateNEON(p []byte) byte
//
// Vectorized UTF-8 validation for arm64 using NEON (ASIMD).
//
// Returns the Validation constants declared in valid.go:
//   3 (ASCII|UTF8) when p is entirely ASCII,
//   1 (UTF8)       when p is valid UTF-8 but not pure ASCII,
//   0 (Invalid)    otherwise.
//
// Strategy: consume 16 bytes at a time testing only the high bit
// (pure-ASCII fast path); on the first non-ASCII lane fall into a
// table-driven validator that classifies byte nibbles with VTBL
// lookups. The tables appear to follow the Keiser/Lemire
// "validating UTF-8 in less than one instruction per byte" scheme --
// TODO confirm the table derivation against the paper.
TEXT ·validateNEON(SB),NOSPLIT,$0-25
	// NOTE(review): operand names must match the Go declaration
	// (func validateNEON(p []byte) byte) or `go vet` (asmdecl) fails;
	// the original used s_base/s_len.
	MOVD p_base+0(FP), R10           // R10 = &p[0]
	MOVD p_len+8(FP), R11            // R11 = remaining byte count
	// Empty input is trivially valid ASCII. (The original branched to
	// `valid` here, reporting UTF8=1 for empty input, while the
	// ascii_loop/tail path below reports ASCII=3 for the same logical
	// case -- fixed for consistency.)
	CBZ R11, valid_ascii
	CMP $16, R11
	BLT small                        // <16 bytes: scalar ASCII scan

	VMOVQ $0x8080808080808080, $0x8080808080808080, V0 // high-bit mask

ascii_loop:
	// Fast path: consume 16-byte chunks while they stay pure ASCII.
	CMP $16, R11
	BLT small

	VLD1 (R10), [V1.B16]
	VCMTST V1.B16, V0.B16, V2.B16    // lane = 0xFF where bit 7 is set
	VMOV V2.D[0], R2
	VMOV V2.D[1], R3
	ORR R2, R3, R2
	CBNZ R2, stop_ascii              // non-ASCII byte in this chunk

	ADD $16, R10
	SUB $16, R11
	B ascii_loop

stop_ascii:
	// Nibble-classification tables for full UTF-8 validation.
	// (These constants are duplicated in check_utf8; keep in sync.)
	VMOVQ $0x0202020202020202, $0x4915012180808080, V11 // class of previous byte's high nibble
	VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13 // class of previous byte's low nibble
	VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15 // class of current byte's high nibble
	VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, V18 // low-nibble mask / 4-byte-lead pattern
	VMOVQ $0x0707070707070707, $0x0707070707070707, V12 // 3-byte-lead pattern (byte>>5 == 0b111)
	VMOVQ $0x8080808080808080, $0x8080808080808080, V20 // "must be continuation" bit
	VMOVQ $0x0000000000000000, $0x0000000000000000, V30 // zero vector for the tail shifts
	VMOVQ $0x0000000000000000, $0x0000000000000000, V3  // previous 16-byte block (none yet)
	// NOTE(review): the original also loaded V14 (0xFF..), V16 (0x7F..),
	// V17 (0xDF..) and V19 (0x08..) here and in check_utf8; those
	// registers are never read anywhere in this function, so the dead
	// loads were removed.

aligned_loop:
	// Validate one 16-byte block against the previous block (V3).
	VLD1.P 16(R10), [V4.B16]
	VEXT $15, V4.B16, V3.B16, V5.B16 // bytes shifted back by 1 (prev1)
	VUSHR $4, V5.B16, V6.B16
	VTBL V6.B16, [V11.B16], V6.B16   // classify prev1 high nibble
	VAND V5.B16, V18.B16, V7.B16
	VTBL V7.B16, [V13.B16], V7.B16   // classify prev1 low nibble
	VUSHR $4, V4.B16, V8.B16
	VTBL V8.B16, [V15.B16], V8.B16   // classify current high nibble
	VAND V6.B16, V7.B16, V9.B16
	VAND V9.B16, V8.B16, V10.B16     // special-case error bits
	VEXT $14, V4.B16, V3.B16, V5.B16 // prev2
	VUSHR $5, V5.B16, V6.B16
	VCMEQ V12.B16, V6.B16, V6.B16    // prev2 is a 3+ byte lead
	VEXT $13, V4.B16, V3.B16, V5.B16 // prev3
	VUSHR $4, V5.B16, V9.B16
	VCMEQ V18.B16, V9.B16, V9.B16    // prev3 is a 4-byte lead
	VORR V6.B16, V9.B16, V9.B16
	VAND V9.B16, V20.B16, V9.B16     // lanes required to be continuations
	VSUB V9.B16, V10.B16, V9.B16     // mismatch leaves a non-zero lane
	VMOV V9.D[0], R1
	VMOV V9.D[1], R2
	ORR R1, R2, R1
	CBNZ R1, no_valid
	VMOV V4.B16, V3.B16              // current block becomes "previous"
	SUB $16, R11, R11
	CMP $16, R11

	BGE aligned_loop

	B small_no_const                 // <16 bytes left; tables already loaded

small:
	CBZ R11, valid_ascii

tail_loop:
	// Scalar scan of a sub-16-byte remainder: pure ASCII is decided
	// here; the first high-bit byte falls through to the validator.
	MOVBU (R10), R2
	AND $0x80, R2
	CBNZ R2, check_utf8
	ADD $1, R10
	SUB $1, R11
	CBNZ R11, tail_loop
	B valid_ascii


check_utf8:
	// Same table constants as stop_ascii (see the notes there).
	VMOVQ $0x0202020202020202, $0x4915012180808080, V11
	VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13
	VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15
	VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, V18
	VMOVQ $0x0707070707070707, $0x0707070707070707, V12
	VMOVQ $0x8080808080808080, $0x8080808080808080, V20
	VMOVQ $0x0000000000000000, $0x0000000000000000, V30
	VMOVQ $0x0000000000000000, $0x0000000000000000, V3

small_no_const:
	// Load the final 16 bytes of the buffer (ending at p+len).
	// NOTE(review): when this function is entered directly with
	// len < 16 and a non-ASCII byte is present, R10-16+R11 points
	// before p[0] and this load reads out of bounds. Validate() only
	// calls in with len >= 32, so the path is unreachable today, but
	// confirm and guard before widening the entry conditions.
	SUB $16, R10, R10
	ADD R11, R10, R10
	VLD1.P 16(R10), [V4.B16]

	// Computed jump into shift_table: entries are one 4-byte branch
	// each, indexed by the remaining length R11 (0..15).
	ADR shift_table, R2
	MOVW R11, R3
	LSL $2, R3
	ADD R3, R2
	B (R2)


shift_table:
	B do_shift_0
	B do_shift_1
	B do_shift_2
	B do_shift_3
	B do_shift_4
	B do_shift_5
	B do_shift_6
	B do_shift_7
	B do_shift_8
	B do_shift_9
	B do_shift_10
	B do_shift_11
	B do_shift_12
	B do_shift_13
	B do_shift_14
	B do_shift_15

	// do_shift_N discards the 16-N already-validated leading bytes of
	// V4 and zero-pads, so end_switch only checks the N tail bytes plus
	// any multi-byte sequence carried over in V3.
do_shift_0:
	// Remaining length 0: replace the block with ASCII filler ('a')
	// so only the carried-over state in V3 is validated.
	VMOVQ $0x6161616161616161, $0x6161616161616161, V4
	B end_switch
do_shift_1:
	VEXT $15, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_2:
	VEXT $14, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_3:
	VEXT $13, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_4:
	VEXT $12, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_5:
	VEXT $11, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_6:
	VEXT $10, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_7:
	VEXT $9, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_8:
	VEXT $8, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_9:
	VEXT $7, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_10:
	VEXT $6, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_11:
	VEXT $5, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_12:
	VEXT $4, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_13:
	VEXT $3, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_14:
	VEXT $2, V30.B16, V4.B16, V4.B16
	B end_switch
do_shift_15:
	VEXT $1, V30.B16, V4.B16, V4.B16
	B end_switch

end_switch:
	// Final validation pass: identical classification to aligned_loop
	// applied to the (shifted) last block. (Label renamed from the
	// original's typo `end_swith`.)
	VEXT $15, V4.B16, V3.B16, V5.B16
	VUSHR $4, V5.B16, V6.B16
	VTBL V6.B16, [V11.B16], V6.B16
	VAND V5.B16, V18.B16, V7.B16
	VTBL V7.B16, [V13.B16], V7.B16
	VUSHR $4, V4.B16, V8.B16
	VTBL V8.B16, [V15.B16], V8.B16
	VAND V6.B16, V7.B16, V9.B16
	VAND V9.B16, V8.B16, V10.B16

	VEXT $14, V4.B16, V3.B16, V5.B16
	VUSHR $5, V5.B16, V6.B16
	VCMEQ V12.B16, V6.B16, V6.B16

	VEXT $13, V4.B16, V3.B16, V5.B16
	VUSHR $4, V5.B16, V9.B16
	VCMEQ V18.B16, V9.B16, V9.B16
	VORR V6.B16, V9.B16, V9.B16

	VAND V9.B16, V20.B16, V9.B16
	VSUB V9.B16, V10.B16, V9.B16
	VMOV V9.D[0], R1
	VMOV V9.D[1], R2
	ORR R1, R2, R1
	CBNZ R1, no_valid

	// Fall through: valid UTF-8 that was not pure ASCII.
valid:
	MOVD $1, R0                      // UTF8
	MOVD R0, ret+24(FP)
	RET

no_valid:
	MOVD $0, R0                      // Invalid
	MOVD R0, ret+24(FP)
	RET

valid_ascii:
	MOVD $3, R0                      // ASCII | UTF8
	MOVD R0, ret+24(FP)
	RET


4 changes: 2 additions & 2 deletions utf8/valid_default.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//go:build purego || !amd64
// +build purego !amd64
//go:build purego
// +build purego

package utf8

Expand Down
4 changes: 2 additions & 2 deletions utf8/valid_support_amd64.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//go:build !purego
// +build !purego
//go:build !purego || amd64
// +build !purego amd64

package utf8

Expand Down
21 changes: 21 additions & 0 deletions utf8/valid_support_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//go:build !purego || arm64
// +build !purego arm64

package utf8

import (
"github.com/segmentio/asm/cpu"
"github.com/segmentio/asm/cpu/arm64"
)

var noNEON = !cpu.ARM64.Has(arm64.ASIMD)

// Validate is a stricter variant of Valid: besides checking that p is
// well-formed UTF-8, its result also reports whether p was pure ASCII.
func Validate(p []byte) Validation {
	// Use the NEON kernel only when the CPU supports it and the input
	// is large enough (32 bytes) to amortize the vector setup cost.
	if !noNEON && len(p) >= 32 {
		return Validation(validateNEON(p))
	}
	return validate(p)
}
2 changes: 1 addition & 1 deletion utf8/valid_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ var someutf8 = []byte("\xF4\x8F\xBF\xBF")

func BenchmarkValid(b *testing.B) {
impls := map[string]func([]byte) bool{
"AVX": Valid,
"SIMD": Valid,
"Stdlib": utf8.Valid,
}

Expand Down