linux/linux-5.4.31/arch/x86/crypto/blowfish-x86_64-asm_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Byte offsets into the crypto context that CTX points at: the round-key
 * array p comes first and spans (16 + 2) 4-byte words; it is followed by
 * the four lookup tables s0..s3, each 256 4-byte words long.
 */
#define p	0
#define s0	(4 * (16 + 2))
#define s1	(4 * (16 + 2 + (256 * 1)))
#define s2	(4 * (16 + 2 + (256 * 2)))
#define s3	(4 * (16 + 2 + (256 * 3)))

/*
 * register macros
 *
 * CTX holds the context pointer while a block is being processed.  %r12
 * is callee-saved in the SysV AMD64 ABI, so every entry point below saves
 * it on entry and restores it before returning.
 *
 * RIO is the pointer used by the block load/store macros.  It is the
 * same register as RT1 (%rsi), so F()/F4() destroy it; the entry points
 * keep dst in a spare register and reload RIO before storing.
 */
#define CTX %r12
#define RIO %rsi

/* 64-bit block state; the 1-way code uses RX0 only, the 4-way code all four */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* 32-bit views of the block registers */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* bits 0-7 of each block register */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/* bits 8-15 of each block register */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* scratch registers: table indices and the running F-function value (RT0) */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

/* 32-bit views of the scratch registers */
#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/* round key preloaded by the 4-way macros */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/
/*
 * F(): one Feistel step on the single block held in RX0.
 *
 * The low 32 bits of RX0 are split into four bytes a, b, c, d (a being
 * the most significant) which index the tables s0, s1, s2 and s3.  The
 * looked-up words are combined in 32-bit arithmetic as
 *	((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * RX0 is left with its two 32-bit halves exchanged and the combined value
 * XORed into the new low half.
 *
 * The ror/rol by 16 pair only exists to expose bits 16-31 through the
 * %ah/%al byte registers; the rol by 32 performs the half exchange.
 * RT0d is only ever written with 32-bit operations, so the upper half of
 * RT0 is zero and the final 64-bit xorq leaves the high half of RX0
 * untouched.
 *
 * Clobbers RT0, RT1 (which is also RIO), RT2 and the flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n into RX0 with a single
 * memory-operand xorq.  That covers two consecutive 4-byte round keys:
 * key n goes into the low half of RX0 and key n+1 into the high half.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), 	RX0;

/*
 * round_enc(n): add round keys n and n+1 to RX0, then run F() twice.
 * Each F() exchanges the halves of RX0, so after the pair the halves are
 * back in the positions they had on entry.
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * add_roundkey_dec(n): XOR round keys n and n-1 into RX0.
 *
 * The 8-byte load starts at key n-1; rotating the loaded value by 32
 * exchanges the two keys so that key n lands in the low half of RX0 and
 * key n-1 in the high half, i.e. the reverse pairing of
 * add_roundkey_enc().  Clobbers RT0.
 *
 * The macro argument is parenthesized, as preload_roundkey_dec() already
 * does, so that an expression argument cannot change the offset
 * arithmetic.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * round_dec(n): add round keys n and n-1 to RX0 (see add_roundkey_dec),
 * then run F() twice.  Each F() exchanges the halves of RX0, so after the
 * pair the halves are back in the positions they had on entry.
 *
 * The last body line deliberately has no trailing backslash: a dangling
 * continuation would pull whatever follows the macro into its body.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * read_block(): load the 8-byte block at (RIO) into RX0.
 *
 * The rotate by 32 followed by the 64-bit byte swap leaves each 4-byte
 * word of the block byte-reversed: the first word in memory ends up,
 * byte-swapped, in the low half of RX0 and the second word, byte-swapped,
 * in the high half.
 */
#define read_block() \
	movq (RIO), 		RX0; \
	rorq $32, 		RX0; \
	bswapq 			RX0;

/*
 * write_block(): byte-swap RX0 as one 64-bit value and store it to the
 * 8 bytes at (RIO).  There is no rotate here, so relative to the layout
 * produced by read_block() the two 4-byte words are written in exchanged
 * positions.  RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq 			RX0; \
	movq RX0, 		(RIO);

/*
 * xor_block(): same byte swap as write_block(), but the result is XORed
 * into the 8 bytes already at (RIO) instead of overwriting them.
 * RX0 is left byte-swapped.
 */
#define xor_block() \
	bswapq 			RX0; \
	xorq RX0, 		(RIO);

ENTRY(__blowfish_enc_blk)
	/*
	 * Encrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true (low byte non-zero): xor output into dst
	 *
	 * %r10 keeps dst and %r11 keeps the caller's %r12 while CTX and
	 * RIO are in use; F() overwrites %rsi, hence the copy of dst.
	 */
	movq %rsi, %r10;
	movq %rdx, RIO;

	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	/* eight key-add + double-F() steps, then the final key pair */
	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is no longer needed: point RIO at dst, give %r12 back */
	movq %r10, RIO;
	movq %r11, %r12;

	/* %rcx is not touched by the 1-way macros, so %cl is still the flag */
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;

.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)

ENTRY(blowfish_dec_blk)
	/*
	 * Decrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r10 keeps dst and %r11 keeps the caller's %r12 while CTX and
	 * RIO are in use; F() overwrites %rsi, hence the copy of dst.
	 */
	movq %rsi, %r10;
	movq %rdx, RIO;

	movq %r12, %r11;
	movq %rdi, CTX;

	read_block();

	/* same structure as encryption, walking the round keys downwards */
	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	/* CTX is no longer needed: give %r12 back, point RIO at dst */
	movq %r11, %r12;
	movq %r10, RIO;

	write_block();
	ret;
ENDPROC(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * F4(x): F() for 4-way, applied to the block register x (one of
 * RX0..RX3; the byte views are formed by pasting "bh"/"bl" onto the
 * macro name).  Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * All four table indices are extracted first: bits 8-15 and 0-7 of x go
 * to RT1 and RT3, then after a rotate by 16 the former bits 24-31 and
 * 16-23 go to RT0 and RT2.  The second rotate by 16 completes a rotate
 * by 32, i.e. the two halves of x are exchanged.  The table words are
 * combined as ((s0[RT0] + s1[RT2]) ^ s2[RT1]) + s3[RT3] in 32-bit
 * arithmetic and XORed into the new low half of x; the 32-bit writes
 * keep the upper half of RT0 zero, so the high half of x is unchanged.
 *
 * Clobbers RT0..RT3 (RT1 is also RIO) and the flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/*
 * add_preloaded_roundkey4(): XOR the 64-bit round-key pair currently
 * held in RKEY into each of the four block registers.  RKEY must have
 * been filled by one of the preload_roundkey_*() macros beforehand.
 */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/*
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n (round keys n and
 * n+1) into RKEY for a later add_preloaded_roundkey4().
 */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * add_roundkey_enc4(n): XOR the round keys already sitting in RKEY into
 * all four blocks, then immediately start loading the next pair (round
 * keys n+2 and n+3) so it is ready for the following step.
 *
 * n is only used to compute the address of the *next* key pair; the keys
 * applied here are whatever was preloaded earlier.  The argument is
 * parenthesized so that an expression argument cannot change the offset
 * arithmetic.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/*
 * round_enc4(n): 4-way counterpart of round_enc().  Applies the
 * preloaded round keys to all four blocks (and preloads the next pair),
 * then runs F4() on each block twice.  The F4() calls are issued block
 * by block within each pass rather than twice in a row on one block.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n-1) (round keys
 * n-1 and n) into RKEY and rotate by 32 so that key n sits in the low
 * half and key n-1 in the high half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * add_roundkey_dec4(n): XOR the round keys already sitting in RKEY into
 * all four blocks, then immediately start loading the pair needed by the
 * following step (round keys n-2 and n-3, in swapped order).
 *
 * n is only used to compute the address of the *next* key pair; the keys
 * applied here are whatever was preloaded earlier.  The argument is
 * parenthesized so that an expression argument cannot change the offset
 * arithmetic.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/*
 * round_dec4(n): 4-way counterpart of round_dec().  Applies the
 * preloaded round keys to all four blocks (and preloads the next pair,
 * counting downwards), then runs F4() on each block twice, block by
 * block within each pass.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * read_block4(): load four consecutive 8-byte blocks from (RIO),
 * 8(RIO), 16(RIO) and 24(RIO) into RX0..RX3.  Each block gets the same
 * rotate-by-32 plus 64-bit byte swap as in read_block(), which leaves
 * both 4-byte words of the block byte-reversed in place.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq 			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq 			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq 			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq 			RX3;

/*
 * write_block4(): byte-swap RX0..RX3 as 64-bit values and store them to
 * the four consecutive 8-byte slots starting at (RIO).  All four block
 * registers are left byte-swapped.
 */
#define write_block4() \
	bswapq 			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq 			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq 			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq 			RX3; \
	movq RX3,		24(RIO);

/*
 * xor_block4(): like write_block4(), but each byte-swapped block is
 * XORed into the 8 bytes already present at its slot instead of
 * overwriting them.  All four block registers are left byte-swapped.
 */
#define xor_block4() \
	bswapq 			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq 			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq 			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq 			RX3; \
	xorq RX3,		24(RIO);

ENTRY(__blowfish_enc_blk_4way)
	/*
	 * Encrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true (low byte non-zero): xor output into dst
	 */

	/*
	 * %r12 (CTX) and %rbx (RX1) are callee-saved.  %rcx is pushed as
	 * well because it doubles as RX2 and is overwritten by block data;
	 * the flag is recovered from the stack after the rounds.
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;

	/* dst is parked in %r11: F4() uses %rsi as scratch and RKEY is %r10 */
	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	/* round_enc4(14) preloaded the final key pair at p + 4*16 */
	add_preloaded_roundkey4();

	/* CTX is dead now, so %r12 is reused to receive the saved xor flag */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	ret;
ENDPROC(__blowfish_enc_blk_4way)

ENTRY(blowfish_dec_blk_4way)
	/*
	 * Decrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* %r12 (CTX) and %rbx (RX1) are callee-saved */
	pushq %r12;
	pushq %rbx;

	/* dst is parked in %r11: F4() uses %rsi as scratch and RKEY is %r10 */
	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	/* round_dec4(3) preloaded the final key pair at p + 0 */
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	ret;
ENDPROC(blowfish_dec_blk_4way)
1 2024-01-30 10:43:28 +00:00			`/* SPDX-License-Identifier: GPL-2.0-or-later */`
			`/*`
			`* Blowfish Cipher Algorithm (x86_64)`
			`*`
			`* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>`
			`*/`

			`#include <linux/linkage.h>`

			`.file "blowfish-x86_64-asm.S"`
			`.text`

			`/* structure of crypto context */`
			`#define p 0`
			`#define s0 ((16 + 2) * 4)`
			`#define s1 ((16 + 2 + (1 * 256)) * 4)`
			`#define s2 ((16 + 2 + (2 * 256)) * 4)`
			`#define s3 ((16 + 2 + (3 * 256)) * 4)`

			`/* register macros */`
			`#define CTX %r12`
			`#define RIO %rsi`

			`#define RX0 %rax`
			`#define RX1 %rbx`
			`#define RX2 %rcx`
			`#define RX3 %rdx`

			`#define RX0d %eax`
			`#define RX1d %ebx`
			`#define RX2d %ecx`
			`#define RX3d %edx`

			`#define RX0bl %al`
			`#define RX1bl %bl`
			`#define RX2bl %cl`
			`#define RX3bl %dl`

			`#define RX0bh %ah`
			`#define RX1bh %bh`
			`#define RX2bh %ch`
			`#define RX3bh %dh`

			`#define RT0 %rdi`
			`#define RT1 %rsi`
			`#define RT2 %r8`
			`#define RT3 %r9`

			`#define RT0d %edi`
			`#define RT1d %esi`
			`#define RT2d %r8d`
			`#define RT3d %r9d`

			`#define RKEY %r10`

			`/***********************************************************************`
			`* 1-way blowfish`
			`***********************************************************************/`
			`#define F() \`
			`rorq $16, RX0; \`
			`movzbl RX0bh, RT0d; \`
			`movzbl RX0bl, RT1d; \`
			`rolq $16, RX0; \`
			`movl s0(CTX,RT0,4), RT0d; \`
			`addl s1(CTX,RT1,4), RT0d; \`
			`movzbl RX0bh, RT1d; \`
			`movzbl RX0bl, RT2d; \`
			`rolq $32, RX0; \`
			`xorl s2(CTX,RT1,4), RT0d; \`
			`addl s3(CTX,RT2,4), RT0d; \`
			`xorq RT0, RX0;`

			`#define add_roundkey_enc(n) \`
			`xorq p+4*(n)(CTX), RX0;`

			`#define round_enc(n) \`
			`add_roundkey_enc(n); \`
			`\`
			`F(); \`
			`F();`

			`#define add_roundkey_dec(n) \`
			`movq p+4*(n-1)(CTX), RT0; \`
			`rorq $32, RT0; \`
			`xorq RT0, RX0;`

			`#define round_dec(n) \`
			`add_roundkey_dec(n); \`
			`\`
			`F(); \`
			`F(); \`

			`#define read_block() \`
			`movq (RIO), RX0; \`
			`rorq $32, RX0; \`
			`bswapq RX0;`

			`#define write_block() \`
			`bswapq RX0; \`
			`movq RX0, (RIO);`

			`#define xor_block() \`
			`bswapq RX0; \`
			`xorq RX0, (RIO);`

			`ENTRY(__blowfish_enc_blk)`
			`/* input:`
			`* %rdi: ctx`
			`* %rsi: dst`
			`* %rdx: src`
			`* %rcx: bool, if true: xor output`
			`*/`
			`movq %r12, %r11;`

			`movq %rdi, CTX;`
			`movq %rsi, %r10;`
			`movq %rdx, RIO;`

			`read_block();`

			`round_enc(0);`
			`round_enc(2);`
			`round_enc(4);`
			`round_enc(6);`
			`round_enc(8);`
			`round_enc(10);`
			`round_enc(12);`
			`round_enc(14);`
			`add_roundkey_enc(16);`

			`movq %r11, %r12;`

			`movq %r10, RIO;`
			`test %cl, %cl;`
			`jnz .L__enc_xor;`

			`write_block();`
			`ret;`
			`.L__enc_xor:`
			`xor_block();`
			`ret;`
			`ENDPROC(__blowfish_enc_blk)`

			`ENTRY(blowfish_dec_blk)`
			`/* input:`
			`* %rdi: ctx`
			`* %rsi: dst`
			`* %rdx: src`
			`*/`
			`movq %r12, %r11;`

			`movq %rdi, CTX;`
			`movq %rsi, %r10;`
			`movq %rdx, RIO;`

			`read_block();`

			`round_dec(17);`
			`round_dec(15);`
			`round_dec(13);`
			`round_dec(11);`
			`round_dec(9);`
			`round_dec(7);`
			`round_dec(5);`
			`round_dec(3);`
			`add_roundkey_dec(1);`

			`movq %r10, RIO;`
			`write_block();`

			`movq %r11, %r12;`

			`ret;`
			`ENDPROC(blowfish_dec_blk)`

			`/**********************************************************************`
			`4-way blowfish, four blocks parallel`
			`**********************************************************************/`

			`/* F() for 4-way. Slower when used alone/1-way, but faster when used`
			`* parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).`
			`*/`
			`#define F4(x) \`
			`movzbl x ## bh, RT1d; \`
			`movzbl x ## bl, RT3d; \`
			`rorq $16, x; \`
			`movzbl x ## bh, RT0d; \`
			`movzbl x ## bl, RT2d; \`
			`rorq $16, x; \`
			`movl s0(CTX,RT0,4), RT0d; \`
			`addl s1(CTX,RT2,4), RT0d; \`
			`xorl s2(CTX,RT1,4), RT0d; \`
			`addl s3(CTX,RT3,4), RT0d; \`
			`xorq RT0, x;`

			`#define add_preloaded_roundkey4() \`
			`xorq RKEY, RX0; \`
			`xorq RKEY, RX1; \`
			`xorq RKEY, RX2; \`
			`xorq RKEY, RX3;`

			`#define preload_roundkey_enc(n) \`
			`movq p+4*(n)(CTX), RKEY;`

			`#define add_roundkey_enc4(n) \`
			`add_preloaded_roundkey4(); \`
			`preload_roundkey_enc(n + 2);`

			`#define round_enc4(n) \`
			`add_roundkey_enc4(n); \`
			`\`
			`F4(RX0); \`
			`F4(RX1); \`
			`F4(RX2); \`
			`F4(RX3); \`
			`\`
			`F4(RX0); \`
			`F4(RX1); \`
			`F4(RX2); \`
			`F4(RX3);`

			`#define preload_roundkey_dec(n) \`
			`movq p+4*((n)-1)(CTX), RKEY; \`
			`rorq $32, RKEY;`

			`#define add_roundkey_dec4(n) \`
			`add_preloaded_roundkey4(); \`
			`preload_roundkey_dec(n - 2);`

			`#define round_dec4(n) \`
			`add_roundkey_dec4(n); \`
			`\`
			`F4(RX0); \`
			`F4(RX1); \`
			`F4(RX2); \`
			`F4(RX3); \`
			`\`
			`F4(RX0); \`
			`F4(RX1); \`
			`F4(RX2); \`
			`F4(RX3);`

			`#define read_block4() \`
			`movq (RIO), RX0; \`
			`rorq $32, RX0; \`
			`bswapq RX0; \`
			`\`
			`movq 8(RIO), RX1; \`
			`rorq $32, RX1; \`
			`bswapq RX1; \`
			`\`
			`movq 16(RIO), RX2; \`
			`rorq $32, RX2; \`
			`bswapq RX2; \`
			`\`
			`movq 24(RIO), RX3; \`
			`rorq $32, RX3; \`
			`bswapq RX3;`

			`#define write_block4() \`
			`bswapq RX0; \`
			`movq RX0, (RIO); \`
			`\`
			`bswapq RX1; \`
			`movq RX1, 8(RIO); \`
			`\`
			`bswapq RX2; \`
			`movq RX2, 16(RIO); \`
			`\`
			`bswapq RX3; \`
			`movq RX3, 24(RIO);`

			`#define xor_block4() \`
			`bswapq RX0; \`
			`xorq RX0, (RIO); \`
			`\`
			`bswapq RX1; \`
			`xorq RX1, 8(RIO); \`
			`\`
			`bswapq RX2; \`
			`xorq RX2, 16(RIO); \`
			`\`
			`bswapq RX3; \`
			`xorq RX3, 24(RIO);`

			`ENTRY(__blowfish_enc_blk_4way)`
			`/* input:`
			`* %rdi: ctx`
			`* %rsi: dst`
			`* %rdx: src`
			`* %rcx: bool, if true: xor output`
			`*/`
			`pushq %r12;`
			`pushq %rbx;`
			`pushq %rcx;`

			`movq %rdi, CTX`
			`movq %rsi, %r11;`
			`movq %rdx, RIO;`

			`preload_roundkey_enc(0);`

			`read_block4();`

			`round_enc4(0);`
			`round_enc4(2);`
			`round_enc4(4);`
			`round_enc4(6);`
			`round_enc4(8);`
			`round_enc4(10);`
			`round_enc4(12);`
			`round_enc4(14);`
			`add_preloaded_roundkey4();`

			`popq %r12;`
			`movq %r11, RIO;`

			`test %r12b, %r12b;`
			`jnz .L__enc_xor4;`

			`write_block4();`

			`popq %rbx;`
			`popq %r12;`
			`ret;`

			`.L__enc_xor4:`
			`xor_block4();`

			`popq %rbx;`
			`popq %r12;`
			`ret;`
			`ENDPROC(__blowfish_enc_blk_4way)`

			`ENTRY(blowfish_dec_blk_4way)`
			`/* input:`
			`* %rdi: ctx`
			`* %rsi: dst`
			`* %rdx: src`
			`*/`
			`pushq %r12;`
			`pushq %rbx;`

			`movq %rdi, CTX;`
			`movq %rsi, %r11`
			`movq %rdx, RIO;`

			`preload_roundkey_dec(17);`
			`read_block4();`

			`round_dec4(17);`
			`round_dec4(15);`
			`round_dec4(13);`
			`round_dec4(11);`
			`round_dec4(9);`
			`round_dec4(7);`
			`round_dec4(5);`
			`round_dec4(3);`
			`add_preloaded_roundkey4();`

			`movq %r11, RIO;`
			`write_block4();`

			`popq %rbx;`
			`popq %r12;`

			`ret;`
			`ENDPROC(blowfish_dec_blk_4way)`