/* Path: ubuntu-buildroot/output/build/glibc-2.36-81-g4f4d7a13edfd.../sysdeps/x86_64/multiarch/memrchr-evex.S  */

/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "evex256-vecs.h"
/* The code below handles match masks in 32-bit registers (kmovd, lzcntl,
   bsrl), i.e. one mask bit per byte of a 32-byte vector, so any other
   vector size is rejected at build time.  */
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

/* Symbol name of the function.  A definition of MEMRCHR made before this
   point takes precedence over the default.  */
# ifndef MEMRCHR
#  define MEMRCHR				__memrchr_evex
# endif

/* Page size assumed by the page-cross check at function entry.  */
# define PAGE_SIZE			4096
/* Vector register that holds the search byte replicated into every byte
   (filled by the vpbroadcastb at function entry).  */
# define VECMATCH			VEC(0)

	.section SECTION(.text), "ax", @progbits
/* void *MEMRCHR (const void *s, int c, size_t n)

   Search the n bytes starting at s from the end towards the start and
   return the address of the last byte equal to the low byte of c, or
   NULL (rax = 0) if there is none or if n is zero.

   Register roles as used by the code below:
     rdi   s.  Never written.
     esi   c.  Only read by the broadcast into VECMATCH; rsi/esi is reused
	   as scratch in L(page_cross) and L(loop_end).
     rdx   n on entry.  After L(more_1x_vec) it holds the distance from s
	   to the current (aligned) scan position; inside L(loop_4x_vec) it
	   is the loop end pointer.
     rax   Scan position, moving down from the end of the buffer.  Also the
	   return value.
     ecx   Match bitmask of the most recently compared vector.  Bit i
	   corresponds to byte i of the vector, so the highest set bit is
	   the match closest to the end of the buffer.
   Also written: r8d, k0, k1, k2, k4, VECMATCH, VEC(2), VEC(3) and the
   flags.  The stack is not touched.

   Strategy: compare one (possibly unaligned) vector ending at the last
   byte, then step down through VEC_SIZE-aligned vectors one at a time,
   then through 4 * VEC_SIZE-aligned blocks in L(loop_4x_vec), and finish
   with up to four vectors in L(last_4x_vec).  Vector loads may cover bytes
   before s; return paths that can see such bytes bounds-check the result
   against s (or the remaining length) before returning it.  */
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  RDX_LP is presumably the 32-bit register name
	   under ILP32, so this write zero-extends len into rdx; the and also
	   sets ZF for the zero-length check below.  */
	and	%RDX_LP, %RDX_LP
# else
	/* Only sets ZF for the zero-length check below.  */
	test	%RDX_LP, %RDX_LP
# endif
	/* len == 0: nothing to search, return NULL.  */
	jz	L(zero_0)

	/* Get end pointer.  Minus one for two reasons: 1) it is necessary for a
	   correct page cross check and 2) it makes rax the address of the last
	   byte, so that subtracting a lzcnt result yields the match address.  */
	leaq	-1(%rdi, %rdx), %rax
	/* Replicate the low byte of c into every byte of VECMATCH.  */
	vpbroadcastb %esi, %VECMATCH

	/* Check if we can load 1x VEC ending at the last byte without crossing
	   a page.  All tested bits being zero means the last byte lies within
	   the first VEC_SIZE bytes of its page, so a VEC_SIZE load ending
	   there could touch the previous page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  This compares the VEC_SIZE bytes that end
	   at the last byte of the buffer; bit 31 of ecx corresponds to that
	   last byte (the address in rax).  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* Bounds-checked return.  Expects rax = address of the byte that bit
	   31 of ecx refers to and edx = number of valid bytes ending at rax
	   (at most VEC_SIZE on every path that reaches this label).

	   lzcnt gives the distance from rax back to the last match.  If ecx
	   is zero (no matches) lzcnt will set it to 32 (VEC_SIZE), which
	   guarantees edx is not greater than it, so NULL is returned.  The
	   same check rejects matches that lie before the start of the
	   buffer.  */
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Return NULL.  Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
	/* Unchecked returns for a match known to be inside the buffer (ecx is
	   non-zero at every jump to these labels).  L(ret_vec_x0) expects rax
	   to be the address of the last byte of the compared vector;
	   L(ret_vec_x0_dec) is entered with rax one past that byte.  */
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret

	.p2align 4,, 10
	/* Entered with rax = address of the last byte and ecx = mask whose bit
	   31 corresponds to that byte (from the entry compare or from
	   L(page_cross)).  */
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (pointer to string) down to a VEC_SIZE boundary.  From
	   here on vectors are loaded from aligned addresses below rax.  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning (completed by the subq below).  */
	movq	%rax, %rdx

	/* Need no matter what: mask for the vector just below rax.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* rdx = rax - s, the number of bytes still to be searched.  */
	subq	%rdi, %rdx

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* At most 2 * VEC_SIZE bytes remain below rax.  Expects ecx = mask of
	   the vector at rax - VEC_SIZE and rdx = rax - s.  Also reached from
	   L(last_4x_vec).  */
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it (and so does
	   L(ret_vec_x0)): rax becomes the last byte of the vector in ecx.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	/* More than VEC_SIZE bytes remain, so the whole first vector is inside
	   the buffer and any match in it can be returned unchecked.  */
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  rdi + rdx is the aligned position, so this
	   is the second vector below it.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt.  kmovd left the upper 32 bits of rcx zero, so the
	   count naturally includes the extra 32 (VEC_SIZE) bytes between rax
	   and the end of this second vector; no match gives 64.  */
	lzcntq	%rcx, %rcx
	/* Return NULL if the match (or lack of one) lies before s.  */
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Inexpensive place to put this regarding code size / target alignments
	   / ICache NLP.  Necessary for 2-byte encoding of jump to page cross
	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
	   in first cache line.

	   Entered with rax = address of the last byte and edx = len.  Loads
	   the aligned vector that contains the last byte instead of the
	   unaligned one ending at it.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out negative alignment (because we are starting from endptr and
	   working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	/* shlx only uses the low 5 bits of the count.  The shift moves the bit
	   of the last byte to bit 31 and drops the bits of bytes past the end
	   of the buffer, giving the mask layout the return paths expect.  */
	shlxl	%ecx, %r8d, %ecx
	/* If the aligned vector starts after s there is more buffer below it;
	   continue on the general path.  */
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	/* The buffer starts inside this vector: same bounds-checked return as
	   L(ret_vec_x0_test).  */
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Return NULL.  Continue creating zero labels that fit in aligning
	   bytes and get 2-byte encoding / are in the same cache line as
	   condition.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
	/* Unchecked return for a match in the second vector below rax.  Expects
	   rax = aligned position (not decremented) and non-zero ecx = mask of
	   the vector at rax - 2 * VEC_SIZE.  */
L(ret_vec_x1):
	/* bsr yields the index of the highest matching byte in that vector.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
	/* More than 2 * VEC_SIZE bytes remain below rax.  ecx = mask of the
	   vector at rax - VEC_SIZE, rdx = rax - s.  */
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Need no matter what: mask for the third vector below rax.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* rdx = remaining - 4 * VEC_SIZE.  Above zero (unsigned) means more
	   than four vectors remain.  */
	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	/* Signed compare of the now non-positive edx: edx <= -VEC_SIZE means
	   at most 3 * VEC_SIZE bytes remained, so the third vector may extend
	   before s and its result must be bounds-checked.  */
	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
	/* Between 3 and 4 vectors remain.  Expects ecx = mask of the vector at
	   rax - 3 * VEC_SIZE, which lies fully inside the buffer.  Also
	   reached from L(last_4x_vec).  */
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)


	/* Need no matter what: the fourth vector may start before s, so
	   compute the candidate address and compare it against s.  With no
	   match lzcnt gives 32 and the candidate falls below s.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	/* rax = last byte of the fourth vector, minus distance to match.  */
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
	/* Bounds-checked return for the third vector below rax (ecx = its
	   mask).  The candidate is rax - (2 * VEC_SIZE + 1) - lzcnt; it is
	   returned only if it is not below s.  */
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
	/* Unchecked return: non-zero ecx = mask of the vector at
	   rax - 3 * VEC_SIZE.  */
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
	/* Unchecked return: non-zero ecx = mask of the vector at
	   rax - 4 * VEC_SIZE.  */
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
	/* More than 4 * VEC_SIZE bytes remained below rax.  ecx = mask of the
	   vector at rax - 3 * VEC_SIZE; rdx = remaining - 4 * VEC_SIZE.  */
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  After the addq, rdx == rax - s
	   again.  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	/* Round rax down to a 4 * VEC_SIZE boundary strictly below its old
	   value.  The first loop block may therefore overlap bytes that were
	   already compared above.  */
	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get endptr for loop in rdx.  NB: Can't just do while rax > rdi because
	   lengths that overflow can be valid and break the comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx

	.p2align 4
	/* Each iteration checks the 4 * VEC_SIZE bytes starting at rax and then
	   steps rax down by the same amount, until rax reaches the aligned
	   block that contains s (rdx).  */
L(loop_4x_vec):
	/* Store 1 where not-equal and 0 where equal in k1 (used to mask later
	   on).  This is the highest of the four vectors.  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	/* Lowest vector: plain equality mask in k4.  */
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found).  k2
	   then has a bit set for every byte position where any of the upper
	   three vectors matched.  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb %VEC(3), %VEC(3), %k2

	/* Any 1s and we found CHAR.  */
	kortestd %k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  Subtracting
	   -(VEC_SIZE * 4) adds VEC_SIZE * 4 (presumably written this way so
	   the immediate fits in a sign-extended byte).  rax becomes the top of
	   the aligned block that contains s.  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	/* edx = rax - s.  The result is at most 4 * VEC_SIZE, so a 32-bit
	   subtract suffices; it also zero-extends into rdx.  */
	subl	%edi, %edx
	/* At most 4 * VEC_SIZE bytes remain below rax; rdx = rax - s.  */
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	/* More than two vectors remain, so the first two are fully inside the
	   buffer and can use the unchecked returns.  */
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)


	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	/* The third vector is the last one and may start before s: compute the
	   candidate address and return it only if it is not below s.  */
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
	/* A match was found in the block at rax.  Work out which vector holds
	   the highest match, checking from the top vector downwards.  */
L(loop_end):
	/* k1 holds not-equal bits for the top vector; invert to get matches.
	   notl does not set flags, hence the explicit testl.  */
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	/* Matches in the vector at rax + 2 * VEC_SIZE (zero bytes of
	   VEC(2)).  */
	vptestnmb %VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	/* With the two upper vectors ruled out, k2 only reflects the vector at
	   rax + VEC_SIZE; k4 is the vector at rax.  */
	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in VEC3)
	   then it won't affect the result in esi (VEC4).  If ecx is non-zero
	   then CHAR in VEC3 and bsrq will use that position.  */
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
	.p2align 4,, 4
	/* Match in the top vector (rax + 3 * VEC_SIZE): bias rax by one vector
	   and share the return sequence below.  */
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
	/* Return rax + 2 * VEC_SIZE + index of the highest set bit of ecx.  */
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret

END(MEMRCHR)
#endif
1 2024-04-01 15:19:46 +00:00			`/* memrchr optimized with 256-bit EVEX instructions.`
			`Copyright (C) 2021-2022 Free Software Foundation, Inc.`
			`This file is part of the GNU C Library.`

			`The GNU C Library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`The GNU C Library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with the GNU C Library; if not, see`
			`<https://www.gnu.org/licenses/>. */`

			`#include <isa-level.h>`

			`#if ISA_SHOULD_BUILD (4)`

			`# include <sysdep.h>`
			`# include "evex256-vecs.h"`
			`# if VEC_SIZE != 32`
			`# error "VEC_SIZE != 32 unimplemented"`
			`# endif`

			`# ifndef MEMRCHR`
			`# define MEMRCHR __memrchr_evex`
			`# endif`

			`# define PAGE_SIZE 4096`
			`# define VECMATCH VEC(0)`

			`.section SECTION(.text), "ax", @progbits`
			`ENTRY_P2ALIGN(MEMRCHR, 6)`
			`# ifdef __ILP32__`
			`/* Clear upper bits. */`
			`and %RDX_LP, %RDX_LP`
			`# else`
			`test %RDX_LP, %RDX_LP`
			`# endif`
			`jz L(zero_0)`

			`/* Get end pointer. Minus one for two reasons. 1) It is necessary for a`
			`correct page cross check and 2) it correctly sets up end ptr to be`
			`subtract by lzcnt aligned. */`
			`leaq -1(%rdi, %rdx), %rax`
			`vpbroadcastb %esi, %VECMATCH`

			`/* Check if we can load 1x VEC without cross a page. */`
			`testl $(PAGE_SIZE - VEC_SIZE), %eax`
			`jz L(page_cross)`

			`/* Don't use rax for pointer here because EVEX has better encoding with`
			`offset % VEC_SIZE == 0. */`
			`vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */`
			`cmpq $VEC_SIZE, %rdx`
			`ja L(more_1x_vec)`
			`L(ret_vec_x0_test):`

			`/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which`
			`will guarantee edx (len) is less than it. */`
			`lzcntl %ecx, %ecx`
			`cmpl %ecx, %edx`
			`jle L(zero_0)`
			`subq %rcx, %rax`
			`ret`

			`/* Fits in aligning bytes of first cache line. */`
			`L(zero_0):`
			`xorl %eax, %eax`
			`ret`

			`.p2align 4,, 9`
			`L(ret_vec_x0_dec):`
			`decq %rax`
			`L(ret_vec_x0):`
			`lzcntl %ecx, %ecx`
			`subq %rcx, %rax`
			`ret`

			`.p2align 4,, 10`
			`L(more_1x_vec):`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x0)`

			`/* Align rax (pointer to string). */`
			`andq $-VEC_SIZE, %rax`

			`/* Recompute length after aligning. */`
			`movq %rax, %rdx`

			`/* Need no matter what. */`
			`vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`subq %rdi, %rdx`

			`cmpq $(VEC_SIZE * 2), %rdx`
			`ja L(more_2x_vec)`
			`L(last_2x_vec):`

			`/* Must dec rax because L(ret_vec_x0_test) expects it. */`
			`decq %rax`
			`cmpl $VEC_SIZE, %edx`
			`jbe L(ret_vec_x0_test)`

			`testl %ecx, %ecx`
			`jnz L(ret_vec_x0)`

			`/* Don't use rax for pointer here because EVEX has better encoding with`
			`offset % VEC_SIZE == 0. */`
			`vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0`
			`kmovd %k0, %ecx`
			`/* NB: 64-bit lzcnt. This will naturally add 32 to position. */`
			`lzcntq %rcx, %rcx`
			`cmpl %ecx, %edx`
			`jle L(zero_0)`
			`subq %rcx, %rax`
			`ret`

			`/* Inexpensive place to put this regarding code size / target alignments`
			`/ ICache NLP. Necessary for 2-byte encoding of jump to page cross`
			`case which in turn is necessary for hot path (len <= VEC_SIZE) to fit`
			`in first cache line. */`
			`L(page_cross):`
			`movq %rax, %rsi`
			`andq $-VEC_SIZE, %rsi`
			`vpcmpb $0, (%rsi), %VECMATCH, %k0`
			`kmovd %k0, %r8d`
			`/* Shift out negative alignment (because we are starting from endptr and`
			`working backwards). */`
			`movl %eax, %ecx`
			`/* notl because eax already has endptr - 1. (-x = ~(x - 1)). */`
			`notl %ecx`
			`shlxl %ecx, %r8d, %ecx`
			`cmpq %rdi, %rsi`
			`ja L(more_1x_vec)`
			`lzcntl %ecx, %ecx`
			`cmpl %ecx, %edx`
			`jle L(zero_1)`
			`subq %rcx, %rax`
			`ret`

			`/* Continue creating zero labels that fit in aligning bytes and get`
			`2-byte encoding / are in the same cache line as condition. */`
			`L(zero_1):`
			`xorl %eax, %eax`
			`ret`

			`.p2align 4,, 8`
			`L(ret_vec_x1):`
			`/* This will naturally add 32 to position. */`
			`bsrl %ecx, %ecx`
			`leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax`
			`ret`

			`.p2align 4,, 8`
			`L(more_2x_vec):`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x0_dec)`

			`vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x1)`

			`/* Need no matter what. */`
			`vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`subq $(VEC_SIZE * 4), %rdx`
			`ja L(more_4x_vec)`

			`cmpl $(VEC_SIZE * -1), %edx`
			`jle L(ret_vec_x2_test)`
			`L(last_vec):`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x2)`


			`/* Need no matter what. */`
			`vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`
			`lzcntl %ecx, %ecx`
			`subq $(VEC_SIZE * 3 + 1), %rax`
			`subq %rcx, %rax`
			`cmpq %rax, %rdi`
			`ja L(zero_1)`
			`ret`

			`.p2align 4,, 8`
			`L(ret_vec_x2_test):`
			`lzcntl %ecx, %ecx`
			`subq $(VEC_SIZE * 2 + 1), %rax`
			`subq %rcx, %rax`
			`cmpq %rax, %rdi`
			`ja L(zero_1)`
			`ret`

			`.p2align 4,, 8`
			`L(ret_vec_x2):`
			`bsrl %ecx, %ecx`
			`leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax`
			`ret`

			`.p2align 4,, 8`
			`L(ret_vec_x3):`
			`bsrl %ecx, %ecx`
			`leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax`
			`ret`

			`.p2align 4,, 8`
			`L(more_4x_vec):`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x2)`

			`vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`testl %ecx, %ecx`
			`jnz L(ret_vec_x3)`

			`/* Check if near end before re-aligning (otherwise might do an`
			`unnecessary loop iteration). */`
			`addq $-(VEC_SIZE * 4), %rax`
			`cmpq $(VEC_SIZE * 4), %rdx`
			`jbe L(last_4x_vec)`

			`decq %rax`
			`andq $-(VEC_SIZE * 4), %rax`
			`movq %rdi, %rdx`
			`/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because`
			`lengths that overflow can be valid and break the comparison. */`
			`andq $-(VEC_SIZE * 4), %rdx`

			`.p2align 4`
			`L(loop_4x_vec):`
			`/* Store 1 were not-equals and 0 where equals in k1 (used to mask later`
			`on). */`
			`vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1`

			`/* VEC(2/3) will have zero-byte where we found a CHAR. */`
			`vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)`
			`vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)`
			`vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4`

			`/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where`
			`CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */`
			`vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z}`
			`vptestnmb %VEC(3), %VEC(3), %k2`

			`/* Any 1s and we found CHAR. */`
			`kortestd %k2, %k4`
			`jnz L(loop_end)`

			`addq $-(VEC_SIZE * 4), %rax`
			`cmpq %rdx, %rax`
			`jne L(loop_4x_vec)`

			`/* Need to re-adjust rdx / rax for L(last_4x_vec). */`
			`subq $-(VEC_SIZE * 4), %rdx`
			`movq %rdx, %rax`
			`subl %edi, %edx`
			`L(last_4x_vec):`

			`/* Used no matter what. */`
			`vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`cmpl $(VEC_SIZE * 2), %edx`
			`jbe L(last_2x_vec)`

			`testl %ecx, %ecx`
			`jnz L(ret_vec_x0_dec)`


			`vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`testl %ecx, %ecx`
			`jnz L(ret_vec_x1)`

			`/* Used no matter what. */`
			`vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0`
			`kmovd %k0, %ecx`

			`cmpl $(VEC_SIZE * 3), %edx`
			`ja L(last_vec)`

			`lzcntl %ecx, %ecx`
			`subq $(VEC_SIZE * 2 + 1), %rax`
			`subq %rcx, %rax`
			`cmpq %rax, %rdi`
			`jbe L(ret_1)`
			`xorl %eax, %eax`
			`L(ret_1):`
			`ret`

			`.p2align 4,, 6`
			`L(loop_end):`
			`kmovd %k1, %ecx`
			`notl %ecx`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x0_end)`

			`vptestnmb %VEC(2), %VEC(2), %k0`
			`kmovd %k0, %ecx`
			`testl %ecx, %ecx`
			`jnz L(ret_vec_x1_end)`

			`kmovd %k2, %ecx`
			`kmovd %k4, %esi`
			`/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)`
			`then it won't affect the result in esi (VEC4). If ecx is non-zero`
			`then CHAR in VEC3 and bsrq will use that position. */`
			`salq $32, %rcx`
			`orq %rsi, %rcx`
			`bsrq %rcx, %rcx`
			`addq %rcx, %rax`
			`ret`
			`.p2align 4,, 4`
			`L(ret_vec_x0_end):`
			`addq $(VEC_SIZE), %rax`
			`L(ret_vec_x1_end):`
			`bsrl %ecx, %ecx`
			`leaq (VEC_SIZE * 2)(%rax, %rcx), %rax`
			`ret`

			`END(MEMRCHR)`
			`#endif`