linux/linux-5.4.31/arch/arm64/lib/copy_template.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 *
 * x0 is never written by the instructions in this template: the copy
 * advances the separate `dst` alias (x6), so dstin still holds dest at
 * .Lexitfunc.
 *
 * Every load and store goes through the ldrb1/strb1, ldrh1/strh1,
 * ldr1/str1 and ldp1/stp1 macros, none of which is defined in this file.
 * NOTE(review): presumably the including file defines them as post-indexed
 * accesses that advance the pointer operand by the trailing immediate -
 * confirm against the includer.
 */
/* Arguments, as listed under "Parameters" above. */
dstin	.req	x0
src	.req	x1
count	.req	x2
/*
 * Scratch registers. tmp1w/tmp2w are the 32-bit (w) views of tmp1/tmp2.
 * tmp2w is not referenced by the instructions visible in this file.
 */
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
/* Running destination pointer; initialised from dstin at entry. */
dst	.req	x6

/* Four register pairs (16 bytes each) used by the ldp1/stp1 bulk copies. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/* Work on a copy of the destination pointer; dstin (x0) stays intact. */
	mov	dst, dstin
	cmp	count, #16
	/*
	* When the length is less than 16 (unsigned compare), skip the
	* alignment step: the accesses are not aligned, and .Ltiny15 copies
	* the bytes according to bits 3..0 of count.
	*/
	b.lo	.Ltiny15

	/* tmp2 = (-src) & 15: distance from src up to the next 16-byte boundary. */
	neg	tmp2, src
	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
	/* Zero means src is already 16-byte aligned. */
	b.eq	.LSrcAligned
	/* count >= 16 and tmp2 <= 15 here, so count stays positive. */
	sub	count, count, tmp2
	/*
	* Copy the leading bytes from src to dst in increasing address
	* order. This way the risk of overwriting source data that has not
	* been read yet is eliminated when the distance between src and dst
	* is less than 16.
	*
	* Each set bit n of tmp2 (n = 0..3) selects one copy of 2^n bytes;
	* together they move exactly tmp2 bytes.
	* NOTE(review): with post-incrementing macros every load below is
	* naturally aligned with respect to src (dst may still be unaligned)
	* - confirm against the macro definitions.
	*/
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	/* 64 or more bytes left (signed compare): take the bulk-copy paths. */
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*
	* tmp1 = count & 0x30 is the number of bytes to move as whole
	* 16-byte pairs: 0, 16, 32 or 48.
	*/
	ands	tmp1, count, #0x30
	/* No whole 16-byte unit left: only the sub-16-byte tail remains. */
	b.eq	.Ltiny15
	/*
	* Enter the three-pair sequence below at the right point:
	* 48 falls through all three copies, 32 starts at 1:, 16 starts at 2:.
	*/
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	/* Falls through into .Ltiny15 for the bytes selected by count bits 3..0. */
.Ltiny15:
	/*
	* Copy the final 0-15 bytes. Only bits 3..0 of count are examined.
	*
	* The tail is deliberately split into several loads/stores that
	* access memory in increasing address order, instead of the approach
	* used by the original cortex-strings memcpy, which loads/stores 16
	* bytes from (src-16) to (dst-16) after moving src back to an
	* aligned address. If that original scheme were kept, memmove would
	* have to guarantee that the src address is at least 16 bytes above
	* the dst address before calling memcpy directly; otherwise some
	* source data would be overwritten. The original scheme was dropped
	* to keep memmove simple and to decouple memcpy from memmove.
	*/
	/* Bit 3 set: copy 8 bytes. */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	/* Bit 2 set: copy 4 bytes. */
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	/* Bit 1 set: copy 2 bytes. */
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	/* Bit 0 set: copy the last single byte; otherwise we are done. */
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	* Bias count by -128. A non-negative result means at least 128
	* bytes remain, which is handled by the pipelined loop.
	*/
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*
	* count is negative from here on, but 128 is a multiple of 64, so
	* its bottom 6 bits still equal the number of bytes left after the
	* 64 bytes copied below - which is all .Ltail63 needs.
	*/
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	/* Any of the bottom 6 bits set: 1-63 bytes still to copy. */
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	* Critical loop.  Start at a new cache line boundary.  Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	* (L1_CACHE_SHIFT is not defined in this file.)
	*
	* On entry count holds (bytes remaining - 128) and is >= 0.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-get 64 bytes data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes. Each register pair is stored
	* before it is reloaded, so no loaded data is lost.
	*/
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	/*
	* Account for one more 64-byte block. Keep looping while the biased
	* count is still non-negative (signed compare), i.e. while another
	* full 64 bytes can be loaded beyond what is already in registers.
	*/
	subs	count, count, #64
	b.ge	1b
	/* Drain the pipeline: store the last 64 bytes that were loaded. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	/*
	* Only multiples of 64 were subtracted from count, so its bottom
	* 6 bits give the 0-63 bytes still to copy.
	*/
	tst	count, #0x3f
	b.ne	.Ltail63
	/*
	* Common exit point for every path above; dstin (x0) still holds dest.
	* NOTE(review): the instructions that follow this label (e.g. the
	* return) are presumably supplied by whatever includes this template
	* - confirm.
	*/
.Lexitfunc:
1 2024-01-30 10:43:28 +00:00			`/* SPDX-License-Identifier: GPL-2.0-only */`
			`/*`
			`* Copyright (C) 2013 ARM Ltd.`
			`* Copyright (C) 2013 Linaro.`
			`*`
			`* This code is based on glibc cortex strings work originally authored by Linaro`
			`* be found @`
			`*`
			`* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/`
			`* files/head:/src/aarch64/`
			`*/`


			`/*`
			`* Copy a buffer from src to dest (alignment handled by the hardware)`
			`*`
			`* Parameters:`
			`* x0 - dest`
			`* x1 - src`
			`* x2 - n`
			`* Returns:`
			`* x0 - dest`
			`*/`
			`dstin .req x0`
			`src .req x1`
			`count .req x2`
			`tmp1 .req x3`
			`tmp1w .req w3`
			`tmp2 .req x4`
			`tmp2w .req w4`
			`dst .req x6`

			`A_l .req x7`
			`A_h .req x8`
			`B_l .req x9`
			`B_h .req x10`
			`C_l .req x11`
			`C_h .req x12`
			`D_l .req x13`
			`D_h .req x14`

			`mov dst, dstin`
			`cmp count, #16`
			`/When memory length is less than 16, the accessed are not aligned./`
			`b.lo .Ltiny15`

			`neg tmp2, src`
			`ands tmp2, tmp2, #15/* Bytes to reach alignment. */`
			`b.eq .LSrcAligned`
			`sub count, count, tmp2`
			`/*`
			`* Copy the leading memory data from src to dst in an increasing`
			`* address order.By this way,the risk of overwriting the source`
			`* memory data is eliminated when the distance between src and`
			`* dst is less than 16. The memory accesses here are alignment.`
			`*/`
			`tbz tmp2, #0, 1f`
			`ldrb1 tmp1w, src, #1`
			`strb1 tmp1w, dst, #1`
			`1:`
			`tbz tmp2, #1, 2f`
			`ldrh1 tmp1w, src, #2`
			`strh1 tmp1w, dst, #2`
			`2:`
			`tbz tmp2, #2, 3f`
			`ldr1 tmp1w, src, #4`
			`str1 tmp1w, dst, #4`
			`3:`
			`tbz tmp2, #3, .LSrcAligned`
			`ldr1 tmp1, src, #8`
			`str1 tmp1, dst, #8`

			`.LSrcAligned:`
			`cmp count, #64`
			`b.ge .Lcpy_over64`
			`/*`
			`* Deal with small copies quickly by dropping straight into the`
			`* exit block.`
			`*/`
			`.Ltail63:`
			`/*`
			`* Copy up to 48 bytes of data. At this point we only need the`
			`* bottom 6 bits of count to be accurate.`
			`*/`
			`ands tmp1, count, #0x30`
			`b.eq .Ltiny15`
			`cmp tmp1w, #0x20`
			`b.eq 1f`
			`b.lt 2f`
			`ldp1 A_l, A_h, src, #16`
			`stp1 A_l, A_h, dst, #16`
			`1:`
			`ldp1 A_l, A_h, src, #16`
			`stp1 A_l, A_h, dst, #16`
			`2:`
			`ldp1 A_l, A_h, src, #16`
			`stp1 A_l, A_h, dst, #16`
			`.Ltiny15:`
			`/*`
			`* Prefer to break one ldp/stp into several load/store to access`
			`* memory in an increasing address order,rather than to load/store 16`
			`* bytes from (src-16) to (dst-16) and to backward the src to aligned`
			`* address,which way is used in original cortex memcpy. If keeping`
			`* the original memcpy process here, memmove need to satisfy the`
			`* precondition that src address is at least 16 bytes bigger than dst`
			`* address,otherwise some source data will be overwritten when memove`
			`* call memcpy directly. To make memmove simpler and decouple the`
			`* memcpy's dependency on memmove, withdrew the original process.`
			`*/`
			`tbz count, #3, 1f`
			`ldr1 tmp1, src, #8`
			`str1 tmp1, dst, #8`
			`1:`
			`tbz count, #2, 2f`
			`ldr1 tmp1w, src, #4`
			`str1 tmp1w, dst, #4`
			`2:`
			`tbz count, #1, 3f`
			`ldrh1 tmp1w, src, #2`
			`strh1 tmp1w, dst, #2`
			`3:`
			`tbz count, #0, .Lexitfunc`
			`ldrb1 tmp1w, src, #1`
			`strb1 tmp1w, dst, #1`

			`b .Lexitfunc`

			`.Lcpy_over64:`
			`subs count, count, #128`
			`b.ge .Lcpy_body_large`
			`/*`
			`* Less than 128 bytes to copy, so handle 64 here and then jump`
			`* to the tail.`
			`*/`
			`ldp1 A_l, A_h, src, #16`
			`stp1 A_l, A_h, dst, #16`
			`ldp1 B_l, B_h, src, #16`
			`ldp1 C_l, C_h, src, #16`
			`stp1 B_l, B_h, dst, #16`
			`stp1 C_l, C_h, dst, #16`
			`ldp1 D_l, D_h, src, #16`
			`stp1 D_l, D_h, dst, #16`

			`tst count, #0x3f`
			`b.ne .Ltail63`
			`b .Lexitfunc`

			`/*`
			`* Critical loop. Start at a new cache line boundary. Assuming`
			`* 64 bytes per line this ensures the entire loop is in one line.`
			`*/`
			`.p2align L1_CACHE_SHIFT`
			`.Lcpy_body_large:`
			`/* pre-get 64 bytes data. */`
			`ldp1 A_l, A_h, src, #16`
			`ldp1 B_l, B_h, src, #16`
			`ldp1 C_l, C_h, src, #16`
			`ldp1 D_l, D_h, src, #16`
			`1:`
			`/*`
			`* interlace the load of next 64 bytes data block with store of the last`
			`* loaded 64 bytes data.`
			`*/`
			`stp1 A_l, A_h, dst, #16`
			`ldp1 A_l, A_h, src, #16`
			`stp1 B_l, B_h, dst, #16`
			`ldp1 B_l, B_h, src, #16`
			`stp1 C_l, C_h, dst, #16`
			`ldp1 C_l, C_h, src, #16`
			`stp1 D_l, D_h, dst, #16`
			`ldp1 D_l, D_h, src, #16`
			`subs count, count, #64`
			`b.ge 1b`
			`stp1 A_l, A_h, dst, #16`
			`stp1 B_l, B_h, dst, #16`
			`stp1 C_l, C_h, dst, #16`
			`stp1 D_l, D_h, dst, #16`

			`tst count, #0x3f`
			`b.ne .Ltail63`
			`.Lexitfunc:`