mirror of https://go.googlesource.com/go
166 lines
2.8 KiB
ArmAsm
166 lines
2.8 KiB
ArmAsm
// Copyright 2018 The Go Authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style
|
||
// license that can be found in the LICENSE file.
|
||
|
||
#include "go_asm.h"
|
||
#include "asm_amd64.h"
|
||
#include "textflag.h"
|
||
|
||
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Arguments arrive in registers (ABIInternal):
//	AX = a
//	BX = b
//	CX = size
// If a and b are the same address the answer is true without looking at
// memory. Otherwise the arguments are moved into the registers memeqbody<>
// reads (SI = a, DI = b, BX = size) and control jumps there.
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
	CMPQ	AX, BX
	JEQ	same_ptr
	// BX currently holds b, so copy it to DI before BX is reused for size.
	MOVQ	BX, DI
	MOVQ	CX, BX
	MOVQ	AX, SI
	JMP	memeqbody<>(SB)
same_ptr:
	MOVQ	$1, AX	// identical pointers: return 1
	RET
|
||
|
||
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Arguments arrive in registers (ABIInternal):
//	AX    = a
//	BX    = b
//	8(DX) = size
// If a and b are the same address the answer is true without looking at
// memory. Otherwise the arguments are moved into the registers memeqbody<>
// reads (SI = a, DI = b, BX = size) and control jumps there.
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	CMPQ	AX, BX
	JEQ	same_ptr
	// BX currently holds b, so copy it to DI before BX is reused for size.
	MOVQ	BX, DI
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	MOVQ	AX, SI
	JMP	memeqbody<>(SB)
same_ptr:
	MOVQ	$1, AX	// identical pointers: return 1
	RET
|
||
|
||
// memeqbody<> is the shared comparison routine the memequal entry points
// in this file jump to.
//
// Input:
//	SI = a
//	DI = b
//	BX = count
// Output:
//	AX = result
//
// Registers written along the way: CX, DX, SI, DI, BX, X0-X7 on the xmm
// path and Y0-Y6 on the ymm path.
//
// Dispatch on count:
//	count < 8    -> short (one masked 8-byte compare)
//	count < 64   -> words (8 bytes per iteration)
//	otherwise    -> 64 bytes per iteration with xmm or ymm registers,
//	                then whatever is left goes through words.
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	short
	CMPQ	BX, $64
	JB	words
#ifndef hasAVX2
	// hasAVX2 is not known at build time, so consult the CPU feature flag.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	chunks_avx2

	// 64 bytes per iteration, held in eight xmm registers.
	PCALIGN $16
chunks_sse:
	CMPQ	BX, $64
	JB	words
	// Even registers take bytes of a, odd registers the matching bytes of b.
	MOVOU	(SI), X0
	MOVOU	16(SI), X2
	MOVOU	32(SI), X4
	MOVOU	48(SI), X6
	MOVOU	(DI), X1
	MOVOU	16(DI), X3
	MOVOU	32(DI), X5
	MOVOU	48(DI), X7
	// Step past this chunk now; the flags these set are overwritten by the
	// CMPL below before anything branches on them.
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	// Byte-wise equality per 16-byte lane, then AND the four lane results
	// together into X0.
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes matched
	JEQ	chunks_sse
	XORQ	AX, AX	// mismatch: return 0
	RET
#endif

	// 64 bytes per iteration, held in ymm registers.
	PCALIGN $16
chunks_avx2:
	CMPQ	BX, $64
	JB	words_after_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	32(SI), Y2
	VMOVDQU	(DI), Y1
	VMOVDQU	32(DI), Y3
	// Step past this chunk now; the flags these set are overwritten by the
	// CMPL below before anything branches on them.
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX
	CMPL	DX, $0xffffffff	// all 32 mask bits set <=> all 64 bytes matched
	JEQ	chunks_avx2
	XORQ	AX, AX	// mismatch: return 0
	VZEROUPPER
	RET

words_after_avx2:
	// Leaving the ymm loop: clear the upper ymm halves, then fall through
	// into the 8-byte loop.
	VZEROUPPER

	// 8 bytes per iteration, held in 64-bit registers.
	PCALIGN $16
words:
	CMPQ	BX, $8
	JBE	tail
	MOVQ	(SI), CX
	ADDQ	$8, SI
	MOVQ	(DI), DX
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	words
	XORQ	AX, AX	// mismatch: return 0
	RET

	// 0-8 bytes remain. Compare the 8 bytes that end at a+count and
	// b+count; with fewer than 8 left this window reaches back over bytes
	// that were already compared equal.
tail:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

	// count < 8.
short:
	CMPQ	BX, $0
	JEQ	done	// nothing to compare; ZF is set, so SETEQ yields 1

	// CX = -(count*8); it is used as the shift count below.
	LEAQ	0(BX*8), CX
	NEGQ	CX

	// Fetch 8 bytes of b into DI.
	CMPB	DI, $0xf8
	JA	b_high

	// load at DI won't cross a page boundary.
	MOVQ	(DI), DI
	JMP	b_loaded
b_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
b_loaded:

	// same for SI.
	CMPB	SI, $0xf8
	JA	a_high
	MOVQ	(SI), SI
	JMP	a_loaded
a_high:
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
a_loaded:

	// The wanted bytes of a and b now sit at the low end of SI and DI.
	// Subtract, then shift the difference left so that only those bytes
	// decide the zero flag tested by SETEQ.
	SUBQ	SI, DI
	SHLQ	CX, DI
done:
	SETEQ	AX
	RET
|