// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego
// +build amd64,gc,!purego

#include "textflag.h"

// c40 and c48 are PSHUFB masks: each rotates every 64-bit lane right by
// 24 and 16 bits respectively (the BLAKE2b rotations by 24 and 16). The
// rotation by 32 is done with PSHUFD $0xB1 and the rotation by 63 with
// PSRLQ/PADDQ/PXOR in HALF_ROUND below.
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

// SHUFFLE rotates the state so that the second HALF_ROUND operates on
// the diagonals of the 4x4 uint64 matrix held in X0-X7 (two lanes per
// register); SHUFFLE_INV undoes the rotation.
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

// HALF_ROUND applies the BlaMka G function to two independent columns,
// (v0, v2, v4, v6) and (v1, v3, v5, v7). BlaMka replaces BLAKE2b's
// a += b with a += b + 2*lo(a)*lo(b): PMULULQ (x86 PMULUDQ) computes
// the 32x32 -> 64 bit product of the low dwords of each lane, and the
// two extra PADDQs add it in twice.
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFD  $0xB1, v6, v6; \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	PSHUFB  c40, v2;       \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFB  c48, v6;       \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	MOVO    v2, t0;        \
	PADDQ   v2, t0;        \
	PSRLQ   $63, v2;       \
	PXOR    t0, v2;        \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFD  $0xB1, v7, v7; \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	PSHUFB  c40, v3;       \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFB  c48, v7;       \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	MOVO    v3, t0;        \
	PADDQ   v3, t0;        \
	PSRLQ   $63, v3;       \
	PXOR    t0, v3

// LOAD_MSG_0/STORE_MSG_0 move one 128-byte row of the block (16
// consecutive qwords starting at byte offset 8*off) through X0-X7.
#define LOAD_MSG_0(block, off) \
	MOVOU 8*(off+0)(block), X0;  \
	MOVOU 8*(off+2)(block), X1;  \
	MOVOU 8*(off+4)(block), X2;  \
	MOVOU 8*(off+6)(block), X3;  \
	MOVOU 8*(off+8)(block), X4;  \
	MOVOU 8*(off+10)(block), X5; \
	MOVOU 8*(off+12)(block), X6; \
	MOVOU 8*(off+14)(block), X7

#define STORE_MSG_0(block, off) \
	MOVOU X0, 8*(off+0)(block);  \
	MOVOU X1, 8*(off+2)(block);  \
	MOVOU X2, 8*(off+4)(block);  \
	MOVOU X3, 8*(off+6)(block);  \
	MOVOU X4, 8*(off+8)(block);  \
	MOVOU X5, 8*(off+10)(block); \
	MOVOU X6, 8*(off+12)(block); \
	MOVOU X7, 8*(off+14)(block)

// LOAD_MSG_1/STORE_MSG_1 move one column of the block (eight 16-byte
// lanes, 128 bytes apart, starting at byte offset 8*off) through X0-X7.
#define LOAD_MSG_1(block, off) \
	MOVOU 8*off+0*8(block), X0;   \
	MOVOU 8*off+16*8(block), X1;  \
	MOVOU 8*off+32*8(block), X2;  \
	MOVOU 8*off+48*8(block), X3;  \
	MOVOU 8*off+64*8(block), X4;  \
	MOVOU 8*off+80*8(block), X5;  \
	MOVOU 8*off+96*8(block), X6;  \
	MOVOU 8*off+112*8(block), X7

#define STORE_MSG_1(block, off) \
	MOVOU X0, 8*off+0*8(block);   \
	MOVOU X1, 8*off+16*8(block);  \
	MOVOU X2, 8*off+32*8(block);  \
	MOVOU X3, 8*off+48*8(block);  \
	MOVOU X4, 8*off+64*8(block);  \
	MOVOU X5, 8*off+80*8(block);  \
	MOVOU X6, 8*off+96*8(block);  \
	MOVOU X7, 8*off+112*8(block)

// BLAMKA_ROUND_0 applies the permutation P to one row of the block and
// BLAMKA_ROUND_1 to one column: two half rounds with a diagonalization
// in between, as in BLAKE2b's round function.
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
	LOAD_MSG_0(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_0(block, off)

#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
	LOAD_MSG_1(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_1(block, off)

// func blamkaSSE4(b *block)
// Applies P to the eight 128-byte rows of the 1024-byte block, then to
// its eight columns.
TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8
	MOVQ  b+0(FP), AX
	MOVOU ·c40<>(SB), X10
	MOVOU ·c48<>(SB), X11

	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)

	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
	RET

// func mixBlocksSSE2(out, a, b, c *block)
// out = a XOR b XOR c, 16 bytes per iteration. SI counts the block's
// 128 qwords down by 2, so the loop runs 64 times over 1024 bytes.
TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, SI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	PXOR  X1, X0
	PXOR  X2, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, SI
	JA    loop
	RET

// func xorBlocksSSE2(out, a, b, c *block)
// out ^= a XOR b XOR c, 16 bytes per iteration.
TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, SI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	MOVOU 0(DX), X3
	PXOR  X1, X0
	PXOR  X2, X0
	PXOR  X3, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, SI
	JA    loop
	RET
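// For reference, the Go declarations these routines implement would look
// roughly like the sketch below. This is an assumption based on the
// "// func" signatures above; in golang.org/x/crypto/argon2 the stubs
// live in a companion blamka_amd64.go file, with block defined as
// [128]uint64, i.e. one 1024-byte Argon2 block:
//
//	//go:noescape
//	func blamkaSSE4(b *block)
//
//	//go:noescape
//	func mixBlocksSSE2(out, a, b, c *block)
//
//	//go:noescape
//	func xorBlocksSSE2(out, a, b, c *block)
//
// Each PMULULQ/PADDQ group in HALF_ROUND computes the BlaMka primitive
// per 64-bit lane; a pure-Go rendering (fBlaMka is a hypothetical helper,
// shown only to document the math) would be:
//
//	func fBlaMka(x, y uint64) uint64 {
//		// x + y + 2*lo32(x)*lo32(y), all arithmetic mod 2^64.
//		return x + y + 2*(x&0xFFFFFFFF)*(y&0xFFFFFFFF)
//	}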