// mirror of https://go.googlesource.com/go
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
|
|
// func addVV(z, x, y []Word) (c Word)
|
|
|
|
TEXT ·addVV(SB), NOSPLIT, $0
|
|
MOVD addvectorfacility+0x00(SB), R1
|
|
BR (R1)
|
|
|
|
TEXT ·addVV_check(SB), NOSPLIT, $0
|
|
MOVB ·hasVX(SB), R1
|
|
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
|
|
MOVD $addvectorfacility+0x00(SB), R1
|
|
MOVD $·addVV_novec(SB), R2
|
|
MOVD R2, 0(R1)
|
|
|
|
// MOVD $·addVV_novec(SB), 0(R1)
|
|
BR ·addVV_novec(SB)
|
|
|
|
vectorimpl:
|
|
MOVD $addvectorfacility+0x00(SB), R1
|
|
MOVD $·addVV_vec(SB), R2
|
|
MOVD R2, 0(R1)
|
|
|
|
// MOVD $·addVV_vec(SB), 0(R1)
|
|
BR ·addVV_vec(SB)
|
|
|
|
GLOBL addvectorfacility+0x00(SB), NOPTR, $8
|
|
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
|
|
|
|
TEXT ·addVV_vec(SB), NOSPLIT, $0
|
|
MOVD z_len+8(FP), R3
|
|
MOVD x+24(FP), R8
|
|
MOVD y+48(FP), R9
|
|
MOVD z+0(FP), R2
|
|
|
|
MOVD $0, R4 // c = 0
|
|
MOVD $0, R0 // make sure it's zero
|
|
MOVD $0, R10 // i = 0
|
|
|
|
// s/JL/JMP/ below to disable the unrolled loop
|
|
SUB $4, R3
|
|
BLT v1
|
|
SUB $12, R3 // n -= 16
|
|
BLT A1 // if n < 0 goto A1
|
|
|
|
MOVD R8, R5
|
|
MOVD R9, R6
|
|
MOVD R2, R7
|
|
|
|
// n >= 0
|
|
// regular loop body unrolled 16x
|
|
VZERO V0 // c = 0
|
|
|
|
UU1:
|
|
VLM 0(R5), V1, V4 // 64-bytes into V1..V8
|
|
ADD $64, R5
|
|
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
|
|
|
|
VLM 0(R6), V9, V12 // 64-bytes into V9..V16
|
|
ADD $64, R6
|
|
VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
|
|
|
|
VACCCQ V1, V9, V0, V25
|
|
VACQ V1, V9, V0, V17
|
|
VACCCQ V2, V10, V25, V26
|
|
VACQ V2, V10, V25, V18
|
|
|
|
VLM 0(R5), V5, V6 // 32-bytes into V1..V8
|
|
VLM 0(R6), V13, V14 // 32-bytes into V9..V16
|
|
ADD $32, R5
|
|
ADD $32, R6
|
|
|
|
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
|
|
|
|
VACCCQ V3, V11, V26, V27
|
|
VACQ V3, V11, V26, V19
|
|
VACCCQ V4, V12, V27, V28
|
|
VACQ V4, V12, V27, V20
|
|
|
|
VLM 0(R5), V7, V8 // 32-bytes into V1..V8
|
|
VLM 0(R6), V15, V16 // 32-bytes into V9..V16
|
|
ADD $32, R5
|
|
ADD $32, R6
|
|
|
|
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
|
|
|
|
VACCCQ V5, V13, V28, V29
|
|
VACQ V5, V13, V28, V21
|
|
VACCCQ V6, V14, V29, V30
|
|
VACQ V6, V14, V29, V22
|
|
|
|
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
|
|
|
|
VACCCQ V7, V15, V30, V31
|
|
VACQ V7, V15, V30, V23
|
|
VACCCQ V8, V16, V31, V0 // V0 has carry-over
|
|
VACQ V8, V16, V31, V24
|
|
|
|
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
|
|
VSTM V17, V24, 0(R7) // 128-bytes into z
|
|
ADD $128, R7
|
|
ADD $128, R10 // i += 16
|
|
SUB $16, R3 // n -= 16
|
|
BGE UU1 // if n >= 0 goto U1
|
|
VLGVG $1, V0, R4 // put cf into R4
|
|
NEG R4, R4 // save cf
|
|
|
|
A1:
|
|
ADD $12, R3 // n += 16
|
|
|
|
// s/JL/JMP/ below to disable the unrolled loop
|
|
BLT v1 // if n < 0 goto v1
|
|
|
|
U1: // n >= 0
|
|
// regular loop body unrolled 4x
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 8(R8)(R10*1), R6
|
|
MOVD 16(R8)(R10*1), R7
|
|
MOVD 24(R8)(R10*1), R1
|
|
ADDC R4, R4 // restore CF
|
|
MOVD 0(R9)(R10*1), R11
|
|
ADDE R11, R5
|
|
MOVD 8(R9)(R10*1), R11
|
|
ADDE R11, R6
|
|
MOVD 16(R9)(R10*1), R11
|
|
ADDE R11, R7
|
|
MOVD 24(R9)(R10*1), R11
|
|
ADDE R11, R1
|
|
MOVD R0, R4
|
|
ADDE R4, R4 // save CF
|
|
NEG R4, R4
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R6, 8(R2)(R10*1)
|
|
MOVD R7, 16(R2)(R10*1)
|
|
MOVD R1, 24(R2)(R10*1)
|
|
|
|
ADD $32, R10 // i += 4
|
|
SUB $4, R3 // n -= 4
|
|
BGE U1 // if n >= 0 goto U1
|
|
|
|
v1:
|
|
ADD $4, R3 // n += 4
|
|
BLE E1 // if n <= 0 goto E1
|
|
|
|
L1: // n > 0
|
|
ADDC R4, R4 // restore CF
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 0(R9)(R10*1), R11
|
|
ADDE R11, R5
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R0, R4
|
|
ADDE R4, R4 // save CF
|
|
NEG R4, R4
|
|
|
|
ADD $8, R10 // i++
|
|
SUB $1, R3 // n--
|
|
BGT L1 // if n > 0 goto L1
|
|
|
|
E1:
|
|
NEG R4, R4
|
|
MOVD R4, c+72(FP) // return c
|
|
RET
|
|
|
|
TEXT ·addVV_novec(SB), NOSPLIT, $0
|
|
novec:
|
|
MOVD z_len+8(FP), R3
|
|
MOVD x+24(FP), R8
|
|
MOVD y+48(FP), R9
|
|
MOVD z+0(FP), R2
|
|
|
|
MOVD $0, R4 // c = 0
|
|
MOVD $0, R0 // make sure it's zero
|
|
MOVD $0, R10 // i = 0
|
|
|
|
// s/JL/JMP/ below to disable the unrolled loop
|
|
SUB $4, R3 // n -= 4
|
|
BLT v1n // if n < 0 goto v1n
|
|
|
|
U1n: // n >= 0
|
|
// regular loop body unrolled 4x
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 8(R8)(R10*1), R6
|
|
MOVD 16(R8)(R10*1), R7
|
|
MOVD 24(R8)(R10*1), R1
|
|
ADDC R4, R4 // restore CF
|
|
MOVD 0(R9)(R10*1), R11
|
|
ADDE R11, R5
|
|
MOVD 8(R9)(R10*1), R11
|
|
ADDE R11, R6
|
|
MOVD 16(R9)(R10*1), R11
|
|
ADDE R11, R7
|
|
MOVD 24(R9)(R10*1), R11
|
|
ADDE R11, R1
|
|
MOVD R0, R4
|
|
ADDE R4, R4 // save CF
|
|
NEG R4, R4
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R6, 8(R2)(R10*1)
|
|
MOVD R7, 16(R2)(R10*1)
|
|
MOVD R1, 24(R2)(R10*1)
|
|
|
|
ADD $32, R10 // i += 4
|
|
SUB $4, R3 // n -= 4
|
|
BGE U1n // if n >= 0 goto U1n
|
|
|
|
v1n:
|
|
ADD $4, R3 // n += 4
|
|
BLE E1n // if n <= 0 goto E1n
|
|
|
|
L1n: // n > 0
|
|
ADDC R4, R4 // restore CF
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 0(R9)(R10*1), R11
|
|
ADDE R11, R5
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R0, R4
|
|
ADDE R4, R4 // save CF
|
|
NEG R4, R4
|
|
|
|
ADD $8, R10 // i++
|
|
SUB $1, R3 // n--
|
|
BGT L1n // if n > 0 goto L1n
|
|
|
|
E1n:
|
|
NEG R4, R4
|
|
MOVD R4, c+72(FP) // return c
|
|
RET
|
|
|
|
TEXT ·subVV(SB), NOSPLIT, $0
|
|
MOVD subvectorfacility+0x00(SB), R1
|
|
BR (R1)
|
|
|
|
TEXT ·subVV_check(SB), NOSPLIT, $0
|
|
MOVB ·hasVX(SB), R1
|
|
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
|
|
MOVD $subvectorfacility+0x00(SB), R1
|
|
MOVD $·subVV_novec(SB), R2
|
|
MOVD R2, 0(R1)
|
|
|
|
// MOVD $·subVV_novec(SB), 0(R1)
|
|
BR ·subVV_novec(SB)
|
|
|
|
vectorimpl:
|
|
MOVD $subvectorfacility+0x00(SB), R1
|
|
MOVD $·subVV_vec(SB), R2
|
|
MOVD R2, 0(R1)
|
|
|
|
// MOVD $·subVV_vec(SB), 0(R1)
|
|
BR ·subVV_vec(SB)
|
|
|
|
GLOBL subvectorfacility+0x00(SB), NOPTR, $8
|
|
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
|
|
|
|
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
|
|
// func subVV(z, x, y []Word) (c Word)
|
|
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
|
|
TEXT ·subVV_vec(SB), NOSPLIT, $0
|
|
MOVD z_len+8(FP), R3
|
|
MOVD x+24(FP), R8
|
|
MOVD y+48(FP), R9
|
|
MOVD z+0(FP), R2
|
|
MOVD $0, R4 // c = 0
|
|
MOVD $0, R0 // make sure it's zero
|
|
MOVD $0, R10 // i = 0
|
|
|
|
// s/JL/JMP/ below to disable the unrolled loop
|
|
SUB $4, R3 // n -= 4
|
|
BLT v1 // if n < 0 goto v1
|
|
SUB $12, R3 // n -= 16
|
|
BLT A1 // if n < 0 goto A1
|
|
|
|
MOVD R8, R5
|
|
MOVD R9, R6
|
|
MOVD R2, R7
|
|
|
|
// n >= 0
|
|
// regular loop body unrolled 16x
|
|
VZERO V0 // cf = 0
|
|
MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
|
|
VLVGG $1, R4, V0 // put carry into V0
|
|
|
|
UU1:
|
|
VLM 0(R5), V1, V4 // 64-bytes into V1..V8
|
|
ADD $64, R5
|
|
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
|
|
|
|
VLM 0(R6), V9, V12 // 64-bytes into V9..V16
|
|
ADD $64, R6
|
|
VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
|
|
|
|
VSBCBIQ V1, V9, V0, V25
|
|
VSBIQ V1, V9, V0, V17
|
|
VSBCBIQ V2, V10, V25, V26
|
|
VSBIQ V2, V10, V25, V18
|
|
|
|
VLM 0(R5), V5, V6 // 32-bytes into V1..V8
|
|
VLM 0(R6), V13, V14 // 32-bytes into V9..V16
|
|
ADD $32, R5
|
|
ADD $32, R6
|
|
|
|
VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
|
|
|
|
VSBCBIQ V3, V11, V26, V27
|
|
VSBIQ V3, V11, V26, V19
|
|
VSBCBIQ V4, V12, V27, V28
|
|
VSBIQ V4, V12, V27, V20
|
|
|
|
VLM 0(R5), V7, V8 // 32-bytes into V1..V8
|
|
VLM 0(R6), V15, V16 // 32-bytes into V9..V16
|
|
ADD $32, R5
|
|
ADD $32, R6
|
|
|
|
VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
|
|
|
|
VSBCBIQ V5, V13, V28, V29
|
|
VSBIQ V5, V13, V28, V21
|
|
VSBCBIQ V6, V14, V29, V30
|
|
VSBIQ V6, V14, V29, V22
|
|
|
|
VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
|
|
|
|
VSBCBIQ V7, V15, V30, V31
|
|
VSBIQ V7, V15, V30, V23
|
|
VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
|
|
VSBIQ V8, V16, V31, V24
|
|
|
|
VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
|
|
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
|
|
VSTM V17, V24, 0(R7) // 128-bytes into z
|
|
ADD $128, R7
|
|
ADD $128, R10 // i += 16
|
|
SUB $16, R3 // n -= 16
|
|
BGE UU1 // if n >= 0 goto U1
|
|
VLGVG $1, V0, R4 // put cf into R4
|
|
SUB $1, R4 // save cf
|
|
|
|
A1:
|
|
ADD $12, R3 // n += 16
|
|
BLT v1 // if n < 0 goto v1
|
|
|
|
U1: // n >= 0
|
|
// regular loop body unrolled 4x
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 8(R8)(R10*1), R6
|
|
MOVD 16(R8)(R10*1), R7
|
|
MOVD 24(R8)(R10*1), R1
|
|
MOVD R0, R11
|
|
SUBC R4, R11 // restore CF
|
|
MOVD 0(R9)(R10*1), R11
|
|
SUBE R11, R5
|
|
MOVD 8(R9)(R10*1), R11
|
|
SUBE R11, R6
|
|
MOVD 16(R9)(R10*1), R11
|
|
SUBE R11, R7
|
|
MOVD 24(R9)(R10*1), R11
|
|
SUBE R11, R1
|
|
MOVD R0, R4
|
|
SUBE R4, R4 // save CF
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R6, 8(R2)(R10*1)
|
|
MOVD R7, 16(R2)(R10*1)
|
|
MOVD R1, 24(R2)(R10*1)
|
|
|
|
ADD $32, R10 // i += 4
|
|
SUB $4, R3 // n -= 4
|
|
BGE U1 // if n >= 0 goto U1n
|
|
|
|
v1:
|
|
ADD $4, R3 // n += 4
|
|
BLE E1 // if n <= 0 goto E1
|
|
|
|
L1: // n > 0
|
|
MOVD R0, R11
|
|
SUBC R4, R11 // restore CF
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 0(R9)(R10*1), R11
|
|
SUBE R11, R5
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R0, R4
|
|
SUBE R4, R4 // save CF
|
|
|
|
ADD $8, R10 // i++
|
|
SUB $1, R3 // n--
|
|
BGT L1 // if n > 0 goto L1n
|
|
|
|
E1:
|
|
NEG R4, R4
|
|
MOVD R4, c+72(FP) // return c
|
|
RET
|
|
|
|
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
|
|
// func subVV(z, x, y []Word) (c Word)
|
|
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
|
|
TEXT ·subVV_novec(SB), NOSPLIT, $0
|
|
MOVD z_len+8(FP), R3
|
|
MOVD x+24(FP), R8
|
|
MOVD y+48(FP), R9
|
|
MOVD z+0(FP), R2
|
|
|
|
MOVD $0, R4 // c = 0
|
|
MOVD $0, R0 // make sure it's zero
|
|
MOVD $0, R10 // i = 0
|
|
|
|
// s/JL/JMP/ below to disable the unrolled loop
|
|
SUB $4, R3 // n -= 4
|
|
BLT v1 // if n < 0 goto v1
|
|
|
|
U1: // n >= 0
|
|
// regular loop body unrolled 4x
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 8(R8)(R10*1), R6
|
|
MOVD 16(R8)(R10*1), R7
|
|
MOVD 24(R8)(R10*1), R1
|
|
MOVD R0, R11
|
|
SUBC R4, R11 // restore CF
|
|
MOVD 0(R9)(R10*1), R11
|
|
SUBE R11, R5
|
|
MOVD 8(R9)(R10*1), R11
|
|
SUBE R11, R6
|
|
MOVD 16(R9)(R10*1), R11
|
|
SUBE R11, R7
|
|
MOVD 24(R9)(R10*1), R11
|
|
SUBE R11, R1
|
|
MOVD R0, R4
|
|
SUBE R4, R4 // save CF
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R6, 8(R2)(R10*1)
|
|
MOVD R7, 16(R2)(R10*1)
|
|
MOVD R1, 24(R2)(R10*1)
|
|
|
|
ADD $32, R10 // i += 4
|
|
SUB $4, R3 // n -= 4
|
|
BGE U1 // if n >= 0 goto U1
|
|
|
|
v1:
|
|
ADD $4, R3 // n += 4
|
|
BLE E1 // if n <= 0 goto E1
|
|
|
|
L1: // n > 0
|
|
MOVD R0, R11
|
|
SUBC R4, R11 // restore CF
|
|
MOVD 0(R8)(R10*1), R5
|
|
MOVD 0(R9)(R10*1), R11
|
|
SUBE R11, R5
|
|
MOVD R5, 0(R2)(R10*1)
|
|
MOVD R0, R4
|
|
SUBE R4, R4 // save CF
|
|
|
|
ADD $8, R10 // i++
|
|
SUB $1, R3 // n--
|
|
BGT L1 // if n > 0 goto L1
|
|
|
|
E1:
|
|
NEG R4, R4
|
|
MOVD R4, c+72(FP) // return c
|
|
RET
|
|
|
|
TEXT ·addVW(SB), NOSPLIT, $0
|
|
MOVD z_len+8(FP), R5 // length of z
|
|
MOVD x+24(FP), R6
|
|
MOVD y+48(FP), R7 // c = y
|
|
MOVD z+0(FP), R8
|
|
|
|
CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
|
|
|
|
// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
|
|
ADDC 0(R6), R7
|
|
MOVD R7, 0(R8)
|
|
CMPBEQ R5, $1, returnResult // len(z) == 1
|
|
MOVD $0, R9
|
|
ADDE 8(R6), R9
|
|
MOVD R9, 8(R8)
|
|
CMPBEQ R5, $2, returnResult // len(z) == 2
|
|
|
|
// Update the counters
|
|
MOVD $16, R12 // i = 2
|
|
MOVD $-2(R5), R5 // n = n - 2
|
|
|
|
loopOverEachWord:
|
|
BRC $12, copySetup // carry = 0, copy the rest
|
|
MOVD $1, R9
|
|
|
|
// Originally we used the carry flag generated in the previous iteration
|
|
// (i.e: ADDE could be used here to do the addition). However, since we
|
|
// already know carry is 1 (otherwise we will go to copy section), we can use
|
|
// ADDC here so the current iteration does not depend on the carry flag
|
|
// generated in the previous iteration. This could be useful when branch prediction happens.
|
|
ADDC 0(R6)(R12*1), R9
|
|
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
|
|
|
|
MOVD $8(R12), R12 // i++
|
|
BRCTG R5, loopOverEachWord // n--
|
|
|
|
// Return the current carry value
|
|
returnResult:
|
|
MOVD $0, R0
|
|
ADDE R0, R0
|
|
MOVD R0, c+56(FP)
|
|
RET
|
|
|
|
// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
|
|
// With the assumption that x and z will not overlap with each other or x and z will
|
|
// point to same memory region, we can use a faster version of copy using only MVC here.
|
|
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
|
|
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
|
|
copySetup:
|
|
ADD R12, R6
|
|
ADD R12, R8
|
|
|
|
CMPBGE R5, $4, mediumLoop
|
|
|
|
smallLoop: // does a loop unrolling to copy word when n < 4
|
|
CMPBEQ R5, $0, returnZero
|
|
MVC $8, 0(R6), 0(R8)
|
|
CMPBEQ R5, $1, returnZero
|
|
MVC $8, 8(R6), 8(R8)
|
|
CMPBEQ R5, $2, returnZero
|
|
MVC $8, 16(R6), 16(R8)
|
|
|
|
returnZero:
|
|
MOVD $0, c+56(FP) // return 0 as carry
|
|
RET
|
|
|
|
mediumLoop:
|
|
CMPBLT R5, $4, smallLoop
|
|
CMPBLT R5, $32, mediumLoopBody
|
|
|
|
largeLoop: // Copying 256 bytes at a time.
|
|
MVC $256, 0(R6), 0(R8)
|
|
MOVD $256(R6), R6
|
|
MOVD $256(R8), R8
|
|
MOVD $-32(R5), R5
|
|
CMPBGE R5, $32, largeLoop
|
|
BR mediumLoop
|
|
|
|
mediumLoopBody: // Copying 32 bytes at a time
|
|
MVC $32, 0(R6), 0(R8)
|
|
MOVD $32(R6), R6
|
|
MOVD $32(R8), R8
|
|
MOVD $-4(R5), R5
|
|
CMPBGE R5, $4, mediumLoopBody
|
|
BR smallLoop
|
|
|
|
returnC:
|
|
MOVD R7, c+56(FP)
|
|
RET
|
|
|
|
TEXT ·subVW(SB), NOSPLIT, $0
|
|
MOVD z_len+8(FP), R5
|
|
MOVD x+24(FP), R6
|
|
MOVD y+48(FP), R7 // The borrow bit passed in
|
|
MOVD z+0(FP), R8
|
|
MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it.
|
|
|
|
CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
|
|
|
|
// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
|
|
MOVD 0(R6), R9
|
|
SUBC R7, R9
|
|
MOVD R9, 0(R8)
|
|
CMPBEQ R5, $1, returnResult
|
|
MOVD 8(R6), R9
|
|
SUBE R0, R9
|
|
MOVD R9, 8(R8)
|
|
CMPBEQ R5, $2, returnResult
|
|
|
|
// Update the counters
|
|
MOVD $16, R12 // i = 2
|
|
MOVD $-2(R5), R5 // n = n - 2
|
|
|
|
loopOverEachWord:
|
|
BRC $3, copySetup // no borrow, copy the rest
|
|
MOVD 0(R6)(R12*1), R9
|
|
|
|
// Originally we used the borrow flag generated in the previous iteration
|
|
// (i.e: SUBE could be used here to do the subtraction). However, since we
|
|
// already know borrow is 1 (otherwise we will go to copy section), we can
|
|
// use SUBC here so the current iteration does not depend on the borrow flag
|
|
// generated in the previous iteration. This could be useful when branch prediction happens.
|
|
SUBC $1, R9
|
|
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
|
|
|
|
MOVD $8(R12), R12 // i++
|
|
BRCTG R5, loopOverEachWord // n--
|
|
|
|
// return the current borrow value
|
|
returnResult:
|
|
SUBE R0, R0
|
|
NEG R0, R0
|
|
MOVD R0, c+56(FP)
|
|
RET
|
|
|
|
// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
|
|
// With the assumption that x and z will not overlap with each other or x and z will
|
|
// point to same memory region, we can use a faster version of copy using only MVC here.
|
|
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
|
|
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
|
|
copySetup:
|
|
ADD R12, R6
|
|
ADD R12, R8
|
|
|
|
CMPBGE R5, $4, mediumLoop
|
|
|
|
smallLoop: // does a loop unrolling to copy word when n < 4
|
|
CMPBEQ R5, $0, returnZero
|
|
MVC $8, 0(R6), 0(R8)
|
|
CMPBEQ R5, $1, returnZero
|
|
MVC $8, 8(R6), 8(R8)
|
|
CMPBEQ R5, $2, returnZero
|
|
MVC $8, 16(R6), 16(R8)
|
|
|
|
returnZero:
|
|
MOVD $0, c+56(FP) // return 0 as borrow
|
|
RET
|
|
|
|
mediumLoop:
|
|
CMPBLT R5, $4, smallLoop
|
|
CMPBLT R5, $32, mediumLoopBody
|
|
|
|
largeLoop: // Copying 256 bytes at a time
|
|
MVC $256, 0(R6), 0(R8)
|
|
MOVD $256(R6), R6
|
|
MOVD $256(R8), R8
|
|
MOVD $-32(R5), R5
|
|
CMPBGE R5, $32, largeLoop
|
|
BR mediumLoop
|
|
|
|
mediumLoopBody: // Copying 32 bytes at a time
|
|
MVC $32, 0(R6), 0(R8)
|
|
MOVD $32(R6), R6
|
|
MOVD $32(R8), R8
|
|
MOVD $-4(R5), R5
|
|
CMPBGE R5, $4, mediumLoopBody
|
|
BR smallLoop
|
|
|
|
returnC:
|
|
MOVD R7, c+56(FP)
|
|
RET
|
|
|
|
// func shlVU(z, x []Word, s uint) (c Word)
|
|
TEXT ·shlVU(SB), NOSPLIT, $0
|
|
BR ·shlVU_g(SB)
|
|
|
|
// func shrVU(z, x []Word, s uint) (c Word)
|
|
TEXT ·shrVU(SB), NOSPLIT, $0
|
|
BR ·shrVU_g(SB)
|
|
|
|
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
|
|
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
|
TEXT ·mulAddVWW(SB), NOSPLIT, $0
|
|
MOVD z+0(FP), R2
|
|
MOVD x+24(FP), R8
|
|
MOVD y+48(FP), R9
|
|
MOVD r+56(FP), R4 // c = r
|
|
MOVD z_len+8(FP), R5
|
|
MOVD $0, R1 // i = 0
|
|
MOVD $0, R7 // i*8 = 0
|
|
MOVD $0, R0 // make sure it's zero
|
|
BR E5
|
|
|
|
L5:
|
|
MOVD (R8)(R1*1), R6
|
|
MULHDU R9, R6
|
|
ADDC R4, R11 // add to low order bits
|
|
ADDE R0, R6
|
|
MOVD R11, (R2)(R1*1)
|
|
MOVD R6, R4
|
|
ADD $8, R1 // i*8 + 8
|
|
ADD $1, R7 // i++
|
|
|
|
E5:
|
|
CMPBLT R7, R5, L5 // i < n
|
|
|
|
MOVD R4, c+64(FP)
|
|
RET
|
|
|
|
// func addMulVVW(z, x []Word, y Word) (c Word)
|
|
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
|
|
TEXT ·addMulVVW(SB), NOSPLIT, $0
|
|
MOVD z+0(FP), R2
|
|
MOVD x+24(FP), R8
|
|
MOVD y+48(FP), R9
|
|
MOVD z_len+8(FP), R5
|
|
|
|
MOVD $0, R1 // i*8 = 0
|
|
MOVD $0, R7 // i = 0
|
|
MOVD $0, R0 // make sure it's zero
|
|
MOVD $0, R4 // c = 0
|
|
|
|
MOVD R5, R12
|
|
AND $-2, R12
|
|
CMPBGE R5, $2, A6
|
|
BR E6
|
|
|
|
A6:
|
|
MOVD (R8)(R1*1), R6
|
|
MULHDU R9, R6
|
|
MOVD (R2)(R1*1), R10
|
|
ADDC R10, R11 // add to low order bits
|
|
ADDE R0, R6
|
|
ADDC R4, R11
|
|
ADDE R0, R6
|
|
MOVD R6, R4
|
|
MOVD R11, (R2)(R1*1)
|
|
|
|
MOVD (8)(R8)(R1*1), R6
|
|
MULHDU R9, R6
|
|
MOVD (8)(R2)(R1*1), R10
|
|
ADDC R10, R11 // add to low order bits
|
|
ADDE R0, R6
|
|
ADDC R4, R11
|
|
ADDE R0, R6
|
|
MOVD R6, R4
|
|
MOVD R11, (8)(R2)(R1*1)
|
|
|
|
ADD $16, R1 // i*8 + 8
|
|
ADD $2, R7 // i++
|
|
|
|
CMPBLT R7, R12, A6
|
|
BR E6
|
|
|
|
L6:
|
|
MOVD (R8)(R1*1), R6
|
|
MULHDU R9, R6
|
|
MOVD (R2)(R1*1), R10
|
|
ADDC R10, R11 // add to low order bits
|
|
ADDE R0, R6
|
|
ADDC R4, R11
|
|
ADDE R0, R6
|
|
MOVD R6, R4
|
|
MOVD R11, (R2)(R1*1)
|
|
|
|
ADD $8, R1 // i*8 + 8
|
|
ADD $1, R7 // i++
|
|
|
|
E6:
|
|
CMPBLT R7, R5, L6 // i < n
|
|
|
|
MOVD R4, c+56(FP)
|
|
RET
|
|
|