mirror of https://go.googlesource.com/go
343 lines
6.7 KiB
ArmAsm
343 lines
6.7 KiB
ArmAsm
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
//go:build ppc64 || ppc64le
|
|
|
|
#include "go_asm.h"
|
|
#include "textflag.h"
|
|
|
|
// Helper names for x-form loads in BE ordering.
// On ppc64le they expand to the byte-reversing loads (MOVDBR/MOVWBR/MOVHBR);
// otherwise they expand to the plain loads (MOVD/MOVW/MOVH). cmpbody feeds
// the loaded values to CMPU, so the first byte in memory has to end up as
// the most significant byte for the integer compare to follow byte order.
|
|
#ifdef GOARCH_ppc64le
|
|
#define _LDBEX MOVDBR
|
|
#define _LWBEX MOVWBR
|
|
#define _LHBEX MOVHBR
|
|
#else
|
|
#define _LDBEX MOVD
|
|
#define _LWBEX MOVW
|
|
#define _LHBEX MOVH
|
|
#endif
|
|
|
|
// SETB_CR0(rout) / SETB_CR1(rout) convert the comparison result held in
// CR0 / CR1 into -1 (less), 0 (equal) or 1 (greater) in rout.
// SETB_CR0_NE(rout) is the variant for comparisons known to be unequal.
// SETB_INIT() must be expanded before any of them is used: it is empty
// when GOPPC64_power9 is defined (the SETB instruction is used directly)
// and otherwise loads the constants the ISEL-based emulation reads.
#ifdef GOPPC64_power9
|
|
#define SETB_CR0(rout) SETB CR0, rout
|
|
#define SETB_CR1(rout) SETB CR1, rout
|
|
#define SETB_INIT()
|
|
#define SETB_CR0_NE(rout) SETB_CR0(rout)
|
|
#else
|
|
// A helper macro to emulate SETB on P8. This assumes
|
|
// -1 is in R20, and 1 is in R21. crxlt and crxeq must
|
|
// also be the same CR field.
// The first ISEL picks R0 when crxeq is set and R21 (1) otherwise; the
// second replaces that with R20 (-1) when crxlt is set.
// NOTE(review): the "equal" result is taken from R0, so this relies on
// R0 holding zero - confirm against the Go ppc64 register conventions.
|
|
#define _SETB(crxlt, crxeq, rout) \
|
|
ISEL crxeq,R0,R21,rout \
|
|
ISEL crxlt,R20,rout,rout
|
|
|
|
// A special case when it is known the comparison
|
|
// will always be not equal. The result must be -1 or 1.
// A single ISEL is enough: R20 (-1) if CR0LT is set, R21 (1) otherwise.
|
|
#define SETB_CR0_NE(rout) \
|
|
ISEL CR0LT,R20,R21,rout
|
|
|
|
#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
|
|
#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
|
|
// Loads the -1 and 1 constants that _SETB and SETB_CR0_NE select from.
#define SETB_INIT() \
|
|
MOVD $-1,R20 \
|
|
MOVD $1,R21
|
|
#endif
|
|
|
|
// Compare orders the byte sequences a and b and leaves -1, 0 or 1 in R3.
// It only sets up the registers cmpbody expects and then branches there;
// if a and b start at the same address the length comparison alone is
// the answer and it returns immediately.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
|
|
// incoming:
|
|
// R3 a addr
|
|
// R4 a len
|
|
// R6 b addr
|
|
// R7 b len
|
|
//
|
|
// on entry to cmpbody:
|
|
// R3 result of comparing len(a) with len(b); returned if all compared bytes match
|
|
// R5 a addr
|
|
// R6 b addr
|
|
// R9 min(len(a),len(b))
|
|
// Load the -1/1 constants if SETB has to be emulated (expands to nothing
// when GOPPC64_power9 is defined).
SETB_INIT()
|
|
// R5 = a addr, which frees R3 for the result.
MOVD R3,R5
|
|
// CR0 = len(a) compared with len(b).
CMP R4,R7,CR0
|
|
// CR7 = a addr compared with b addr.
CMP R3,R6,CR7
|
|
// R9 = len(a) if len(a) < len(b), else len(b).
ISEL CR0LT,R4,R7,R9
|
|
// R3 = -1, 0 or 1 from the length comparison in CR0.
SETB_CR0(R3)
|
|
BC $12,30,LR // beqlr cr7: same start address, so the length comparison in R3 is the result
|
|
// Branch (not call) to cmpbody; its RET returns to our caller.
BR cmpbody<>(SB)
|
|
|
|
// cmpstring orders the byte sequences a and b and leaves -1, 0 or 1 in R3.
// It differs from the other entry point in this file only in which
// registers the arguments arrive in; after shuffling them into the layout
// cmpbody expects it branches there. If a and b start at the same address
// the length comparison alone is the answer and it returns immediately.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
|
|
// incoming:
|
|
// R3 a addr -> R5
|
|
// R4 a len (only read; used for the length comparison and min)
|
|
// R5 b addr -> R6
|
|
// R6 b len (read for the length comparison and min, then overwritten by b addr)
|
|
//
|
|
// on entry to cmpbody:
|
|
// R3 result of comparing len(a) with len(b); returned if all compared bytes match
|
|
// R5 a addr
|
|
// R6 b addr
|
|
// R9 min(len(a),len(b))
|
|
// Load the -1/1 constants if SETB has to be emulated (expands to nothing
// when GOPPC64_power9 is defined).
SETB_INIT()
|
|
// CR0 = len(a) compared with len(b).
CMP R4,R6,CR0
|
|
// CR7 = a addr compared with b addr.
CMP R3,R5,CR7
|
|
// R9 = len(a) if len(a) < len(b), else len(b). This must precede the
// move below, which overwrites the b length in R6.
ISEL CR0LT,R4,R6,R9
|
|
// R6 = b addr.
MOVD R5,R6
|
|
// R5 = a addr, which frees R3 for the result.
MOVD R3,R5
|
|
// R3 = -1, 0 or 1 from the length comparison in CR0.
SETB_CR0(R3)
|
|
BC $12,30,LR // beqlr cr7: same start address, so the length comparison in R3 is the result
|
|
// Branch (not call) to cmpbody; its RET returns to our caller.
BR cmpbody<>(SB)
|
|
|
|
// byteswap<> is a 16-byte read-only constant that exists only on ppc64le.
// cmpbody loads it into the SWAP register (V21) at its "different" label
// and uses it as the VPERM control vector for V3 and V4 before comparing
// them one doubleword at a time.
#ifdef GOARCH_ppc64le
|
|
DATA byteswap<>+0(SB)/8, $0x0706050403020100
|
|
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
|
|
GLOBL byteswap<>+0(SB), RODATA, $16
|
|
#define SWAP V21
|
|
#endif
|
|
|
|
// cmpbody compares R9 bytes starting at R5 (a) and R6 (b).
// On entry:
//   R3  value to return if all R9 bytes are equal
//   R5  a addr
//   R6  b addr
//   R9  number of bytes to compare
//   R20/R21 must hold -1/1 when SETB is emulated (both callers in this
//   file expand SETB_INIT before branching here).
// On return R3 is the entry value if the bytes match, otherwise -1 or 1
// according to the first differing byte.
// The code below dispatches on R9 to a path specialised for the size.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
|
|
start:
|
|
// Classify the length once; each result lives in its own CR field so the
// three branches below need no further compares.
CMP R9,$16,CR0
|
|
CMP R9,$32,CR1
|
|
CMP R9,$64,CR2
|
|
// R10 = 16, used by the vector paths as the index of the second 16 bytes.
MOVD $16,R10
|
|
// R9 < 16
BLT cmp8
|
|
// R9 < 32
BLT CR1,cmp16
|
|
// R9 < 64; otherwise fall through to the 64-byte loop.
BLT CR2,cmp32
|
|
|
|
// Compare 64 bytes per iteration as four 16-byte vectors. R10 is expected
// to hold 16 here (set before the dispatch branches at the top of cmpbody).
cmp64: // >= 64B
|
|
DCBT (R5) // optimize for size>=64
|
|
DCBT (R6) // cache hint
|
|
|
|
SRD $6,R9,R14 // There is at least one iteration.
|
|
// CTR = R9 / 64, the number of full 64-byte iterations.
MOVD R14,CTR
|
|
// R9 = R9 mod 64, the tail length. CR0 EQ records "no tail" and is tested
// by the beqlr after the loop; nothing in the loop writes CR0.
ANDCC $63,R9,R9
|
|
CMP R9,$16,CR1 // Do setup for tail check early on.
|
|
CMP R9,$32,CR2
|
|
CMP R9,$48,CR3
|
|
// R9 = tail - 16: the index, relative to the end of the last full 64-byte
// block, of the final 16-byte load that ends exactly at the last byte.
ADD $-16,R9,R9
|
|
|
|
MOVD $32,R11 // set offsets to load into vector
|
|
MOVD $48,R12 // set offsets to load into vector
|
|
|
|
PCALIGN $16
|
|
cmp64_loop:
|
|
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
|
|
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different // jump out if they differ
|
|
|
|
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
|
|
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
|
|
LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
|
|
LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
ADD $64,R5,R5 // increment to next 64 bytes of A
|
|
ADD $64,R6,R6 // increment to next 64 bytes of B
|
|
BDNZ cmp64_loop
|
|
BC $12,2,LR // beqlr: tail length is 0 (CR0 from the ANDCC above), return R3 unchanged
|
|
|
|
// Finish out tail with minimal overlapped checking.
|
|
// Note, 0 tail is handled by beqlr above.
// CR1/CR2/CR3 hold the tail length compared with 16/32/48, and R9 holds
// tail-16. Each section compares whole 16-byte vectors from the start of
// the tail and then continues at cmp64_tail_gt0, whose single load at
// index R9 ends exactly at the last byte; for tails shorter than 16 that
// index is negative, so the load re-reads bytes already found equal.
|
|
BLE CR1,cmp64_tail_gt0
|
|
BLE CR2,cmp64_tail_gt16
|
|
BLE CR3,cmp64_tail_gt32
|
|
|
|
cmp64_tail_gt48: // 49 - 63 B
|
|
// Bytes 0-15 of the tail.
LXVD2X (R0)(R5),V3
|
|
LXVD2X (R0)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
// Bytes 16-31 of the tail (R10 = 16).
LXVD2X (R5)(R10),V3
|
|
LXVD2X (R6)(R10),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
// Bytes 32-47 of the tail (R11 = 32).
LXVD2X (R5)(R11),V3
|
|
LXVD2X (R6)(R11),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
BR cmp64_tail_gt0
|
|
|
|
PCALIGN $16
|
|
cmp64_tail_gt32: // 33 - 48B
|
|
// Bytes 0-15 of the tail.
LXVD2X (R0)(R5),V3
|
|
LXVD2X (R0)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
// Bytes 16-31 of the tail (R10 = 16).
LXVD2X (R5)(R10),V3
|
|
LXVD2X (R6)(R10),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
BR cmp64_tail_gt0
|
|
|
|
PCALIGN $16
|
|
cmp64_tail_gt16: // 17 - 32B
|
|
// Bytes 0-15 of the tail.
LXVD2X (R0)(R5),V3
|
|
LXVD2X (R0)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
BR cmp64_tail_gt0
|
|
|
|
PCALIGN $16
|
|
cmp64_tail_gt0: // 1 - 16B
|
|
// Final 16 bytes, ending at the last byte of the input (R9 = tail-16).
LXVD2X (R5)(R9),V3
|
|
LXVD2X (R6)(R9),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
// Everything matched: R3 still holds the value supplied by the caller.
RET
|
|
|
|
PCALIGN $16
|
|
// Compare the first 32 bytes, then (if there are more) the last 32 bytes;
// the two windows overlap when the length is below 64. R10 is expected to
// hold 16 here (set before the dispatch branches at the top of cmpbody).
cmp32: // 32 - 63B
|
|
// R9 = length - 32; CR0 EQ if the length is exactly 32.
ANDCC $31,R9,R9
|
|
|
|
// Bytes 0-15.
LXVD2X (R0)(R5),V3
|
|
LXVD2X (R0)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
// Bytes 16-31.
LXVD2X (R10)(R5),V3
|
|
LXVD2X (R10)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
BC $12,2,LR // beqlr: length was exactly 32 (CR0 from the ANDCC above), return R3 unchanged
|
|
// R10 = R9 + 16, the index of the final 16 bytes.
ADD R9,R10,R10
|
|
|
|
// 16 bytes starting at index length-32.
LXVD2X (R9)(R5),V3
|
|
LXVD2X (R9)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
|
|
// Final 16 bytes, ending at the last byte of the input.
LXVD2X (R10)(R5),V3
|
|
LXVD2X (R10)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
// Everything matched: R3 still holds the value supplied by the caller.
RET
|
|
|
|
PCALIGN $16
|
|
// Compare the first 16 bytes, then (if there are more) the last 16 bytes;
// the two windows overlap when the length is below 32.
cmp16: // 16 - 31B
|
|
// R9 = length - 16; CR0 EQ if the length is exactly 16.
ANDCC $15,R9,R9
|
|
// Bytes 0-15.
LXVD2X (R0)(R5),V3
|
|
LXVD2X (R0)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
BC $12,2,LR // beqlr: length was exactly 16 (CR0 from the ANDCC above), return R3 unchanged
|
|
|
|
// Final 16 bytes, ending at the last byte of the input.
LXVD2X (R9)(R5),V3
|
|
LXVD2X (R9)(R6),V4
|
|
VCMPEQUDCC V3,V4,V1
|
|
BGE CR6,different
|
|
// Everything matched: R3 still holds the value supplied by the caller.
RET
|
|
|
|
PCALIGN $16
|
|
// Reached from the vector paths when a 16-byte block of a (in V3) and of
// b (in V4) are not equal. Decides which side is smaller by comparing the
// block as two unsigned 64-bit values, upper doubleword first, and
// returns -1 or 1 in R3.
different:
|
|
#ifdef GOARCH_ppc64le
|
|
// On little-endian targets permute both vectors with the byteswap<>
// constant first, so the doubleword compares below see the bytes in the
// order needed for a byte-wise ordering.
MOVD $byteswap<>+00(SB),R16
|
|
LXVD2X (R16)(R0),SWAP // Set up swap string
|
|
|
|
VPERM V3,V3,SWAP,V3
|
|
VPERM V4,V4,SWAP,V4
|
|
#endif
|
|
|
|
// VS35 and VS36 are the VSX names of V3 and V4.
MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
|
|
MFVSRD VS36,R10
|
|
|
|
CMPU R16,R10
|
|
// Upper halves equal: the difference is in the lower doublewords.
BEQ lower
|
|
// Not equal here, so the result is -1 or 1.
SETB_CR0_NE(R3)
|
|
RET
|
|
|
|
PCALIGN $16
|
|
lower:
|
|
VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
|
|
MFVSRD VS35,R16
|
|
VSLDOI $8,V4,V4,V4
|
|
MFVSRD VS36,R10
|
|
|
|
CMPU R16,R10
|
|
// The vectors differ and their upper halves matched, so the lower halves
// cannot be equal and the not-equal variant is sufficient.
SETB_CR0_NE(R3)
|
|
RET
|
|
|
|
PCALIGN $16
|
|
// Short inputs (fewer than 16 bytes). With GOPPC64_power10 a single
// length-controlled vector load per side handles every length; otherwise
// the length is narrowed step by step (8, 4, 2, 1, 0) and each step
// compares the first and the last N bytes, which may overlap.
// In every case R3 is kept when the bytes match and replaced by the byte
// comparison result otherwise.
cmp8: // 8 - 15B (0 - 15B if GOPPC64_power10)
|
|
#ifdef GOPPC64_power10
|
|
// Move the byte count into the most significant byte of R9.
// NOTE(review): presumably the position LXVLL reads its length from -
// confirm against the Power ISA description of lxvll.
SLD $56,R9,R9
|
|
LXVLL R5,R9,V3 // Load bytes starting from MSB to LSB, unused are zero filled.
|
|
LXVLL R6,R9,V4
|
|
VCMPUQ V3,V4,CR0 // Compare as a 128b integer.
|
|
// R6 = -1, 0 or 1 from the byte comparison.
SETB_CR0(R6)
|
|
ISEL CR0EQ,R3,R6,R3 // If equal, length determines the return value.
|
|
RET
|
|
#else
|
|
CMP R9,$8
|
|
BLT cmp4
|
|
// R9 = length - 8, the index of the last 8 bytes.
ANDCC $7,R9,R9
|
|
// R10/R11 = first 8 bytes of a/b, R12/R14 = last 8 bytes of a/b, all
// loaded in big-endian order.
_LDBEX (R0)(R5),R10
|
|
_LDBEX (R0)(R6),R11
|
|
_LDBEX (R9)(R5),R12
|
|
_LDBEX (R9)(R6),R14
|
|
CMPU R10,R11,CR0
|
|
// R5 = result for the first 8 bytes.
SETB_CR0(R5)
|
|
CMPU R12,R14,CR1
|
|
// R6 = result for the last 8 bytes.
SETB_CR1(R6)
|
|
CRAND CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
|
|
// R4 = result for the last 8 bytes if the first 8 matched, else the
// result for the first 8 bytes.
ISEL CR0EQ,R6,R5,R4
|
|
// Keep R3 only if both halves matched; otherwise return R4.
ISEL CR1EQ,R3,R4,R3
|
|
RET
|
|
|
|
PCALIGN $16
|
|
cmp4: // 4 - 7B
|
|
CMP R9,$4
|
|
BLT cmp2
|
|
// R9 = length - 4, the index of the last 4 bytes.
ANDCC $3,R9,R9
|
|
// R10/R11 = first 4 bytes of a/b, R12/R14 = last 4 bytes of a/b.
_LWBEX (R0)(R5),R10
|
|
_LWBEX (R0)(R6),R11
|
|
_LWBEX (R9)(R5),R12
|
|
_LWBEX (R9)(R6),R14
|
|
// Insert the first word into the upper 32 bits of the register holding
// the last word, so one 64-bit unsigned compare orders first-then-last.
RLDIMI $32,R10,$0,R12
|
|
RLDIMI $32,R11,$0,R14
|
|
CMPU R12,R14
|
|
BR cmp0
|
|
|
|
PCALIGN $16
|
|
cmp2: // 2 - 3B
|
|
CMP R9,$2
|
|
BLT cmp1
|
|
// R9 = length - 2, the index of the last 2 bytes.
ANDCC $1,R9,R9
|
|
// R10/R11 = first 2 bytes of a/b, R12/R14 = last 2 bytes of a/b.
_LHBEX (R0)(R5),R10
|
|
_LHBEX (R0)(R6),R11
|
|
_LHBEX (R9)(R5),R12
|
|
_LHBEX (R9)(R6),R14
|
|
// Same merge as in cmp4: first halfword goes into the upper 32 bits, the
// last halfword stays in the lower 32 bits.
RLDIMI $32,R10,$0,R12
|
|
RLDIMI $32,R11,$0,R14
|
|
CMPU R12,R14
|
|
BR cmp0
|
|
|
|
PCALIGN $16
|
|
cmp1:
|
|
// Length is 0 or 1 here. For 0 the compare below leaves CR0 EQ, so cmp0
// returns R3 unchanged without touching memory.
CMP R9,$0
|
|
BEQ cmp0
|
|
MOVBZ (R5),R10
|
|
MOVBZ (R6),R11
|
|
CMPU R10,R11
|
|
// Common exit: CR0 holds the byte comparison from the path that got here.
cmp0:
|
|
// R6 = -1, 0 or 1 from CR0.
SETB_CR0(R6)
|
|
// Bytes equal: keep R3. Otherwise return the byte comparison.
ISEL CR0EQ,R3,R6,R3
|
|
RET
|
|
#endif
|