linux/linux-5.4.31/arch/arm64/lib/memmove.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro. The original code can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 *
 * If dest is below src, or dest is at or beyond src + n (no overlap that a
 * forward copy could corrupt), branch to __memcpy. Otherwise the regions
 * overlap with dest at or above src, and the buffer is copied in reverse
 * order, starting from the last byte and working down.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest (x0 is never written on the reverse-copy path)
 */

/* Incoming arguments. src and count are modified as the copy progresses. */
dstin	.req	x0
src	.req	x1
count	.req	x2
/*
 * Scratch registers. tmp2w, tmp3 and tmp3w are declared but are not
 * referenced by the code visible in this file.
 */
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
/* Working destination pointer, so that dstin (the return value) stays intact. */
dst	.req	x6

/*
 * Four 16-byte register pairs (low/high halves) used with ldp/stp to move
 * data in chunks of up to 64 bytes.
 */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/*
	* memmove is emitted as a weak symbol. Both entry labels below are
	* placed on the same instruction, so __memmove and memmove name the
	* same code.
	*/
	.weak memmove
ENTRY(__memmove)
ENTRY(memmove)
	/*
	* Dispatch. Both comparisons are unsigned (lo/hs):
	*   dstin <  src          -> __memcpy
	*   dstin >= src + count  -> __memcpy (also taken when count == 0
	*                            and dstin == src)
	* These are plain branches, not calls, so __memcpy returns directly
	* to our caller. NOTE(review): this presumably relies on __memcpy
	* copying in ascending address order; its implementation is not
	* visible here.
	*/
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

	/*
	* Reverse copy: src <= dstin < src + count. Point src and dst one
	* byte past the end of their buffers and work downwards. Every
	* chunk is loaded before it is stored and chunks are handled from
	* high to low addresses, so a store never overwrites source bytes
	* that have not been loaded yet.
	*/
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15  /* Fewer than 16 bytes; accesses may be unaligned. */

	/*
	* tmp2 = number of bytes between the (end) src pointer and the
	* 16-byte boundary below it; zero means src is already aligned.
	*/
	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the tmp2 leading bytes first so that src becomes 16-byte
	* aligned. The cost of these extra instructions is acceptable, and
	* all following loads are then based on an aligned address.
	* Each set bit of tmp2 (1, 2, 4, 8) selects one copy of that size;
	* the pre-index writeback ("!") moves both pointers down.
	*/
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	/* count = bytes still to copy; 64 or more go to the block copier. */
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*
	* tmp1 = count & 0x30 is 0x00, 0x10, 0x20 or 0x30. The three
	* 16-byte copies below fall through into each other, so entering
	* at the top copies 48 bytes, at 1: 32 bytes and at 2: 16 bytes.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f		/* 0x20: two 16-byte copies. */
	b.lt	2f		/* 0x10: one 16-byte copy. */
	ldp	A_l, A_h, [src, #-16]!	/* 0x30: all three copies. */
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	/*
	* Copy the final 0-15 bytes: bits 3..0 of count select an 8-, 4-,
	* 2- and 1-byte copy respectively.
	*/
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	/* Last byte: no writeback, the pointers are not used afterwards. */
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	/*
	* From here on count is biased by -128. Only multiples of 64 are
	* subtracted from it, so its bottom 6 bits still give the number of
	* tail bytes.
	*/
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	/* Only the last pair uses writeback, moving both pointers down by 64. */
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	/* Any remaining bytes (count & 0x3f) are handled by the tail code. */
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-load 64 bytes data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	* Interleave the loads of the next 64-byte block with the stores of
	* the previously loaded 64-byte block. src runs one block (64 bytes)
	* ahead of dst; the D pair carries the writeback for both pointers.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	/* Loop done: store the last 64-byte block that was loaded. */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	/* Any remaining bytes (count & 0x3f) are handled by the tail code. */
	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPIPROC(memmove)
EXPORT_SYMBOL(memmove)
ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
1 2024-01-30 10:43:28 +00:00			`/* SPDX-License-Identifier: GPL-2.0-only */`
			`/*`
			`* Copyright (C) 2013 ARM Ltd.`
			`* Copyright (C) 2013 Linaro.`
			`*`
			`* This code is based on glibc cortex strings work originally authored by Linaro`
			`* be found @`
			`*`
			`* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/`
			`* files/head:/src/aarch64/`
			`*/`

			`#include <linux/linkage.h>`
			`#include <asm/assembler.h>`
			`#include <asm/cache.h>`

			`/*`
			`* Move a buffer from src to test (alignment handled by the hardware).`
			`* If dest <= src, call memcpy, otherwise copy in reverse order.`
			`*`
			`* Parameters:`
			`* x0 - dest`
			`* x1 - src`
			`* x2 - n`
			`* Returns:`
			`* x0 - dest`
			`*/`
			`dstin .req x0`
			`src .req x1`
			`count .req x2`
			`tmp1 .req x3`
			`tmp1w .req w3`
			`tmp2 .req x4`
			`tmp2w .req w4`
			`tmp3 .req x5`
			`tmp3w .req w5`
			`dst .req x6`

			`A_l .req x7`
			`A_h .req x8`
			`B_l .req x9`
			`B_h .req x10`
			`C_l .req x11`
			`C_h .req x12`
			`D_l .req x13`
			`D_h .req x14`

			`.weak memmove`
			`ENTRY(__memmove)`
			`ENTRY(memmove)`
			`cmp dstin, src`
			`b.lo __memcpy`
			`add tmp1, src, count`
			`cmp dstin, tmp1`
			`b.hs __memcpy /* No overlap. */`

			`add dst, dstin, count`
			`add src, src, count`
			`cmp count, #16`
			`b.lo .Ltail15 /*probably non-alignment accesses.*/`

			`ands tmp2, src, #15 /* Bytes to reach alignment. */`
			`b.eq .LSrcAligned`
			`sub count, count, tmp2`
			`/*`
			`* process the aligned offset length to make the src aligned firstly.`
			`* those extra instructions' cost is acceptable. It also make the`
			`* coming accesses are based on aligned address.`
			`*/`
			`tbz tmp2, #0, 1f`
			`ldrb tmp1w, [src, #-1]!`
			`strb tmp1w, [dst, #-1]!`
			`1:`
			`tbz tmp2, #1, 2f`
			`ldrh tmp1w, [src, #-2]!`
			`strh tmp1w, [dst, #-2]!`
			`2:`
			`tbz tmp2, #2, 3f`
			`ldr tmp1w, [src, #-4]!`
			`str tmp1w, [dst, #-4]!`
			`3:`
			`tbz tmp2, #3, .LSrcAligned`
			`ldr tmp1, [src, #-8]!`
			`str tmp1, [dst, #-8]!`

			`.LSrcAligned:`
			`cmp count, #64`
			`b.ge .Lcpy_over64`

			`/*`
			`* Deal with small copies quickly by dropping straight into the`
			`* exit block.`
			`*/`
			`.Ltail63:`
			`/*`
			`* Copy up to 48 bytes of data. At this point we only need the`
			`* bottom 6 bits of count to be accurate.`
			`*/`
			`ands tmp1, count, #0x30`
			`b.eq .Ltail15`
			`cmp tmp1w, #0x20`
			`b.eq 1f`
			`b.lt 2f`
			`ldp A_l, A_h, [src, #-16]!`
			`stp A_l, A_h, [dst, #-16]!`
			`1:`
			`ldp A_l, A_h, [src, #-16]!`
			`stp A_l, A_h, [dst, #-16]!`
			`2:`
			`ldp A_l, A_h, [src, #-16]!`
			`stp A_l, A_h, [dst, #-16]!`

			`.Ltail15:`
			`tbz count, #3, 1f`
			`ldr tmp1, [src, #-8]!`
			`str tmp1, [dst, #-8]!`
			`1:`
			`tbz count, #2, 2f`
			`ldr tmp1w, [src, #-4]!`
			`str tmp1w, [dst, #-4]!`
			`2:`
			`tbz count, #1, 3f`
			`ldrh tmp1w, [src, #-2]!`
			`strh tmp1w, [dst, #-2]!`
			`3:`
			`tbz count, #0, .Lexitfunc`
			`ldrb tmp1w, [src, #-1]`
			`strb tmp1w, [dst, #-1]`

			`.Lexitfunc:`
			`ret`

			`.Lcpy_over64:`
			`subs count, count, #128`
			`b.ge .Lcpy_body_large`
			`/*`
			`* Less than 128 bytes to copy, so handle 64 bytes here and then jump`
			`* to the tail.`
			`*/`
			`ldp A_l, A_h, [src, #-16]`
			`stp A_l, A_h, [dst, #-16]`
			`ldp B_l, B_h, [src, #-32]`
			`ldp C_l, C_h, [src, #-48]`
			`stp B_l, B_h, [dst, #-32]`
			`stp C_l, C_h, [dst, #-48]`
			`ldp D_l, D_h, [src, #-64]!`
			`stp D_l, D_h, [dst, #-64]!`

			`tst count, #0x3f`
			`b.ne .Ltail63`
			`ret`

			`/*`
			`* Critical loop. Start at a new cache line boundary. Assuming`
			`* 64 bytes per line this ensures the entire loop is in one line.`
			`*/`
			`.p2align L1_CACHE_SHIFT`
			`.Lcpy_body_large:`
			`/* pre-load 64 bytes data. */`
			`ldp A_l, A_h, [src, #-16]`
			`ldp B_l, B_h, [src, #-32]`
			`ldp C_l, C_h, [src, #-48]`
			`ldp D_l, D_h, [src, #-64]!`
			`1:`
			`/*`
			`* interlace the load of next 64 bytes data block with store of the last`
			`* loaded 64 bytes data.`
			`*/`
			`stp A_l, A_h, [dst, #-16]`
			`ldp A_l, A_h, [src, #-16]`
			`stp B_l, B_h, [dst, #-32]`
			`ldp B_l, B_h, [src, #-32]`
			`stp C_l, C_h, [dst, #-48]`
			`ldp C_l, C_h, [src, #-48]`
			`stp D_l, D_h, [dst, #-64]!`
			`ldp D_l, D_h, [src, #-64]!`
			`subs count, count, #64`
			`b.ge 1b`
			`stp A_l, A_h, [dst, #-16]`
			`stp B_l, B_h, [dst, #-32]`
			`stp C_l, C_h, [dst, #-48]`
			`stp D_l, D_h, [dst, #-64]!`

			`tst count, #0x3f`
			`b.ne .Ltail63`
			`ret`
			`ENDPIPROC(memmove)`
			`EXPORT_SYMBOL(memmove)`
			`ENDPROC(__memmove)`
			`EXPORT_SYMBOL(__memmove)`