Replacing pcmpestrm on Arm
In the mode used here (equal-any comparison on unsigned bytes), pcmpestrm checks whether each byte element (m128i_u8[index]) of str2 occurs anywhere in str1. If it does, the corresponding bit of the result is set to 1.
For details about the intrinsic function that corresponds to pcmpestrm, see the Intel Intrinsics Guide.
- Code on x86:
/// Thin wrapper that issues a single x86 `pcmpestrm` instruction.
///
/// Operand mapping, as given by the asm constraints below:
///   - str1 is passed in an SSE register (%1), str2 in an SSE register or memory (%2);
///   - len1 is placed in eax/rax ("a") and len2 in edx/rdx ("d");
///   - MODE is emitted as the immediate operand (%5), so it must be a compile-time constant;
///   - the condition flags are declared clobbered ("cc").
/// Returns the 128-bit value the instruction leaves in the output register.
template<int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
#ifdef __clang__
  /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
  /// clang doesn't support Y-prefixed asm constraints.
  register volatile __m128i result asm ("xmm0");
  __asm__ __volatile__ ("pcmpestrm %5, %2, %1": "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
#else
  /// The "Yz" output constraint is used here in place of the explicit xmm0
  /// register variable needed on the clang path above.
  __m128i result;
  __asm__ __volatile__ ("pcmpestrm %5, %2, %1": "=Yz"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
#endif
  return result;
}
- Alternative for Kunpeng processors:
#include <arm_neon.h>

/* A 128-bit value viewed either as a NEON vector or as 16 individual bytes. */
typedef union __attribute__((aligned(16))) __oword {
    int32x4_t m128i;
    uint8_t m128i_u8[16];
} __oword;

/*
 * Normalize an explicit string length the way pcmpestrm does for byte
 * operands: take the absolute value and saturate it to 16.
 * Written without abs()/negating first so that INT_MIN cannot overflow.
 */
static inline int SSE4_saturate_len(int len) {
    if (len < 0) {
        len = (len < -16) ? 16 : -len;
    }
    return (len > 16) ? 16 : len;
}

/*
 * Software replacement for pcmpestrm on Arm.
 *
 * For every byte i of str2 (i < len2), bit i of the returned mask is set when
 * that byte equals any of the first len1 bytes of str1; all other bits are 0.
 *
 * str1, len1  set of bytes to look for, and how many of them are valid
 * str2, len2  bytes to test, and how many of them are valid
 * returns     16-bit mask, one bit per byte position of str2
 *
 * Both lengths are saturated to the 16 bytes a vector holds, so the byte
 * arrays are never indexed out of range and the shift never exceeds bit 15.
 *
 * MODE is accepted only to keep the call sites identical to the x86 version;
 * it is not inspected. Only the mode used in Impala is implemented:
 * STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS.
 * NOTE(review): the x86 version returns the mask inside an __m128i, whereas
 * this one returns it as a plain uint16_t - callers must be adapted to match.
 */
template<int MODE>
static inline uint16_t SSE4_cmpestrm(int32x4_t str1, int len1, int32x4_t str2, int len2) {
    __oword a, b;
    a.m128i = str1;
    b.m128i = str2;

    const int n1 = SSE4_saturate_len(len1);
    const int n2 = SSE4_saturate_len(len2);

    uint16_t result = 0;
    for (int i = 0; i < n2; i++) {
        for (int j = 0; j < n1; j++) {
            if (a.m128i_u8[j] == b.m128i_u8[i]) {
                result |= (uint16_t)(1u << i);
                break;  /* bit i is already set; no need to scan the rest of str1 */
            }
        }
    }
    return result;
}
Parent topic: Source Code Modification Cases