鲲鹏社区首页
中文
注册
我要评分
文档获取效率
文档正确性
内容完整性
文档易理解
在线提单
论坛求助

编译和安装

操作步骤

  1. 使用PuTTY工具,以root用户登录服务器。
  2. 执行以下命令下载源码到“/path/to”目录。
    cd /path/to
    git clone -b v5.0.1428 https://github.com/rcedgar/muscle.git
  3. 执行以下命令进入“src”目录。
    cd /path/to/muscle/src
  4. 执行以下命令修改“Makefile”文件。
    1. 打开“Makefile”文件。
      vi Makefile
    2. 按“i”进入编辑模式,修改第1、2、4、5和8行,删除ccache、-msse、-mfpmath=sse、-static关键字。

      修改前:

      修改后:

      执行4.a之后,按“Esc”键,输入“:set nu”,按“Enter”即可显示出行号。

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  5. 执行以下命令修改“pairhmm.h”文件。
    1. 打开“pairhmm.h”文件。
      vi pairhmm.h
    2. 按“i”进入编辑模式,修改第34行,在“>>”的两个“>”之间增加一个空格(即改为“> >”)。

      修改前:

      修改后:

      执行5.a之后,按“Esc”键,输入“:set nu”,按“Enter”即可显示出行号。

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  6. 执行以下命令修改“pairhmm.cpp”文件。
    1. 打开“pairhmm.cpp”文件。
      vi pairhmm.cpp
    2. 按“i”进入编辑模式,修改第70行,在“>>”的两个“>”之间增加一个空格(即改为“> >”)。

      修改前:

      修改后:

      执行6.a之后,按“Esc”键,输入“:set nu”,按“Enter”即可显示出行号。

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  7. 执行以下命令修改“perturbhmm.cpp”文件。
    1. 打开“perturbhmm.cpp”文件。
      vi perturbhmm.cpp
    2. 按“i”进入编辑模式,修改第51和62行,将“abs”改成“fabs”。

      修改前:

      修改后:

      执行7.a之后,按“Esc”键,输入“:set nu”,按“Enter”即可显示出行号。

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  8. 执行以下命令修改“multisequence.cpp”文件。
    1. 打开“multisequence.cpp”文件。
      vi multisequence.cpp
    2. 按“i”进入编辑模式,在头文件列表中增加“#include <errno.h>”。

      修改前:

      修改后:

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  9. 执行以下命令修改“myutils.cpp”文件。
    1. 打开“myutils.cpp”文件。
      vi myutils.cpp
    2. 按“i”进入编辑模式,删除第2055行处“GetVersionString”的整段代码。该段代码存在缺陷,删除后不影响程序的正常使用。

      修改前:

      修改后:

      执行9.a之后,按“Esc”键,输入“:set nu”,按“Enter”即可显示出行号。

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  10. 执行以下命令修改“timing.h”文件。
    1. 打开“timing.h”文件。
      vi timing.h
    2. 按“i”进入编辑模式。
      • 在文件开头添加如下条件编译的头文件引用(仅在aarch64平台下包含“KunpengTrans.h”)。
        #if defined(__aarch64__)
        #include "KunpengTrans.h"
        #endif

        修改前:

        修改后:

      • 修改第25行的相关代码“__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi))”,修改代码如下。
        #if defined(__x86_64__)
                      __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
         
        #elif defined(__aarch64__)
               // AArch64 substitute for rdtsc: read the counter register cntvct_el0 and the
               // frequency register cntfrq_el0, scale the count by
               // KUNPENG_CPU_FREQUENCY_MHZ / cntfrq_el0, then store the low 32 bits of the
               // scaled value in lo and the high 32 bits in hi.
               // NOTE(review): the scale factor is named in MHz; confirm the unit reported by
               // cntfrq_el0 so that the resulting tick rate is the one the caller expects.
               unsigned int _kp_tmp_CPUFreq = KUNPENG_CPU_FREQUENCY_MHZ; 
               unsigned long long _kp_tmp_TSCCount; 
               unsigned long long _kp_tmp_TSCFreq; 
               __asm__ __volatile__ ("mrs %0, cntfrq_el0" : "=r" (_kp_tmp_TSCFreq)); 
               __asm__ __volatile__ ("mrs %0, cntvct_el0" : "=r" (_kp_tmp_TSCCount)); 
               lo = (unsigned int)(_kp_tmp_TSCCount * _kp_tmp_CPUFreq / _kp_tmp_TSCFreq); 
               hi = (unsigned int)((_kp_tmp_TSCCount * _kp_tmp_CPUFreq / _kp_tmp_TSCFreq) >> 32); 
        #endif

        修改前:

        修改后:

        执行10.a之后,按“Esc”键,输入“:set nu”,按“Enter”即可显示出行号。

    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  11. 执行以下命令创建“KunpengTrans.h”文件。
    1. 创建“KunpengTrans.h”文件。

      vi KunpengTrans.h

    2. 按“i”进入编辑模式,添加如下内容。
      /*
       * @Description: KunpengTrans.h
       * @Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
       */
      #ifndef KUNPENG_TRANS_H
      #define KUNPENG_TRANS_H
      
      #include <string.h>
      
      
      /* ATTENTION:
       * Please set KUNPENG_CPU_FREQUENCY_MHZ to the actual CPU frequency of your running environment
       * (the name indicates the value is expressed in MHz).
       */
      const int KUNPENG_CPU_FREQUENCY_MHZ = 2600;
      /* NOTE(review): presumably the CPUID leaf number used for the extended-feature query (eax = 7) - confirm. */
      const int EAX_LEAF = 7;
      /* NOTE(review): a shift amount of 3; its user is not visible in this part of the header - confirm. */
      const int SHIFT_THREE = 3;
      /* CASE eax = 0; Highest Function Parameter and Manufacturer ID.
       * Prototype only; the definition is not part of this section of the header.
       * NOTE(review): presumably writes the emulated register values through ebx, edx and ecx - confirm.
       */
      void GetCPUManuID(unsigned int *ebx, unsigned int *edx, unsigned int *ecx);
      
      
      /* Single-bit masks for feature flags, grouped by the register they are reported in.
       * NOTE(review): presumably the layout of the eax = 1 query declared right after this list - confirm.
       */
      // %ecx
      #define bit_SSE3 (1 << 0)
      #define bit_PCLMUL (1 << 1)
      #define bit_LZCNT (1 << 5)
      #define bit_SSSE3 (1 << 9)
      #define bit_FMA (1 << 12)
      #define bit_CMPXCHG16B (1 << 13)
      #define bit_SSE4_1 (1 << 19)
      #define bit_SSE4_2 (1 << 20)
      #define bit_MOVBE (1 << 22)
      #define bit_POPCNT (1 << 23)
      #define bit_AES (1 << 25)
      #define bit_XSAVE (1 << 26)
      #define bit_OSXSAVE (1 << 27)
      #define bit_AVX (1 << 28)
      #define bit_F16C (1 << 29)
      #define bit_RDRND (1 << 30)
      
      // %edx
      #define bit_CMPXCHG8B (1 << 8)
      #define bit_CMOV (1 << 15)
      #define bit_MMX (1 << 23)
      #define bit_FXSAVE (1 << 24)
      #define bit_SSE (1 << 25)
      #define bit_SSE2 (1 << 26)
      /* CASE eax = 1; Processor Info and Feature Bits
       * Skylake:       eax = 0x00050654, ebx = 0x43400800, ecx = 0x7ffefbf7, edx = 0xbfebfbff (sampled on a 6148)
       * Cascade Lake : eax = 0x00050657, ebx = 0x08400800, ecx = 0x7ffefbf7, edx = 0xbfebfbff (sampled on a 6248)
       * Prototype only; the definition is not part of this section of the header.
      */
      void GetCPUFeature(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx);
      
      
      /* Single-bit masks for feature flags, grouped by the register they are reported in.
       * NOTE(review): presumably the layout of the eax = 7, ecx = 0 query declared right after this list - confirm.
       * bit_AVX512VL uses an unsigned literal because bit 31 does not fit a positive int.
       */
      /* %ebx */
      #define bit_FSGSBASE (1 << 0)
      #define bit_SGX (1 << 2)
      #define bit_BMI (1 << 3)
      #define bit_HLE (1 << 4)
      #define bit_AVX2 (1 << 5)
      #define bit_BMI2 (1 << 8)
      #define bit_RTM (1 << 11)
      #define bit_MPX (1 << 14)
      #define bit_AVX512F (1 << 16)
      #define bit_AVX512DQ (1 << 17)
      #define bit_RDSEED (1 << 18)
      #define bit_ADX (1 << 19)
      #define bit_AVX512IFMA (1 << 21)
      #define bit_CLFLUSHOPT (1 << 23)
      #define bit_CLWB (1 << 24)
      #define bit_AVX512PF (1 << 26)
      #define bit_AVX512ER (1 << 27)
      #define bit_AVX512CD (1 << 28)
      #define bit_SHA (1 << 29)
      #define bit_AVX512BW (1 << 30)
      #define bit_AVX512VL (1u << 31)
      
      /* %ecx */
      #define bit_PREFETCHWT1 (1 << 0)
      #define bit_AVX512VBMI (1 << 1)
      #define bit_PKU (1 << 3)
      #define bit_OSPKE (1 << 4)
      #define bit_AVX512VBMI2 (1 << 6)
      #define bit_SHSTK (1 << 7)
      #define bit_GFNI (1 << 8)
      #define bit_VAES (1 << 9)
      #define bit_AVX512VNNI (1 << 11)
      #define bit_VPCLMULQDQ (1 << 10)
      #define bit_AVX512BITALG (1 << 12)
      #define bit_AVX512VPOPCNTDQ (1 << 14)
      #define bit_RDPID (1 << 22)
      #define bit_MOVDIRI (1 << 27)
      #define bit_MOVDIR64B (1 << 28)
      
      /* %edx */
      #define bit_AVX5124VNNIW (1 << 2)
      #define bit_AVX5124FMAPS (1 << 3)
      #define bit_IBT (1 << 20)
      #define bit_PCONFIG (1 << 18)
      /* CASE eax = 7, ecx = 0; Extended Features
       * Skylake:       eax = 0x00000000, ebx = 0xd39ffffb, ecx = 0x00000018, edx = 0x9c002400 (sampled on a 6148)
       * Cascade Lake : eax = 0x00000000, ebx = 0xd39ffffb, ecx = 0x00000818, edx = 0xbc000400 (sampled on a 6248)
       * Prototype only; the definition is not part of this section of the header.
       */
      void GetExtendCPUFeature(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx);
      
      
      /*
       * GetSupportedCPUID is only supported for eax = 0, eax = 1, and eax = 7 with ecx = 0.
       * In all other cases it always returns 0x0.
       * chipID supports the values 1 and 2: 1 for a Skylake server (e.g. 6148), 2 for Cascade Lake (6248).
       * Prototype only; the definition is not part of this section of the header.
       */
      void GetSupportedCPUID(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx);
      
      /*
       * Prototypes for string store / move helpers; the definitions are not part of this
       * section of the header.
       * NOTE(review): the names suggest software replacements for the x86 "rep stos" /
       * "rep movs" instructions, with B/W/D/Q selecting the element width and df / DF the
       * direction flag - confirm against the implementation.
       */
      void RepStos(void *dest, unsigned long long src, unsigned long long len, unsigned width, int df);
      
      
      void RepStosB(void *dest, unsigned long long src, unsigned long long len, int DF);
      
      
      void RepStosW(void *dest, unsigned long long src, unsigned long long len, int DF);
      
      
      void RepStosD(void *dest, unsigned long long src, unsigned long long len, int DF);
      
      
      void RepStosQ(void *dest, unsigned long long src, unsigned long long len, int DF);
      
      void RepMovs(void *dest, void *src, unsigned long long len, unsigned width, int df);
      
      
      #include <arm_neon.h>
      
      /* Function prefix: internal linkage plus the GCC/Clang always_inline attribute. */
      #define KP_FORCE_INLINE static inline __attribute__((always_inline))
      
      /*
       * 128-bit vector value that can be viewed through any of the NEON integer vector
       * types (signed or unsigned lanes of 8, 16, 32 or 64 bits). All members share the
       * same storage, so writing one member and reading another reinterprets the bits.
       */
      typedef union {
          int8x16_t vect_s8;
          int16x8_t vect_s16;
          int32x4_t vect_s32;
          int64x2_t vect_s64;
          uint8x16_t vect_u8;
          uint16x8_t vect_u16;
          uint32x4_t vect_u32;
          uint64x2_t vect_u64;
      } __kp_m128i;
      
      /* Polarity selectors; both values lie within the 0x30 field of the imm8 control byte. */
      #define _KP_SIDD_NEGATIVE_POLARITY 0x10        // negate results
      #define _KP_SIDD_MASKED_NEGATIVE_POLARITY 0x30 // negate results only before end of string
      
      /*
       * Per-lane bit selectors used to expand an integer bitmask into a vector mask:
       * entry i holds the single bit (1 << (i % 8)). The 8-bit table repeats the eight
       * selectors twice so that it can be loaded either as one 8-lane half or as a full
       * 16-lane vector. Both tables are read-only lookup data, hence const; they are
       * 16-byte aligned.
       */
      static const uint16_t g_kp_mask_epi16[8] __attribute__((aligned(16))) = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
      static const uint8_t g_kp_mask_epi8[16] __attribute__((aligned(16))) = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                                                            0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
      
      /*
       * Build the 8x8 equality matrix for 16-bit lanes.
       * For k = 0..7, mtx[k].vect_u16 lane i is all-ones when lane k of b equals lane i
       * of a, and zero otherwise. a and b are __kp_m128i values; mtx is an array of at
       * least 8 __kp_m128i. The lane index must be a literal, hence the unrolled form.
       */
      #define KP_PCMPSTR_EQ_16x8(a, b, mtx)                                                           \
          {                                                                                        \
              mtx[0].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 0)), a.vect_u16); \
              mtx[1].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 1)), a.vect_u16); \
              mtx[2].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 2)), a.vect_u16); \
              mtx[3].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 3)), a.vect_u16); \
              mtx[4].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 4)), a.vect_u16); \
              mtx[5].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 5)), a.vect_u16); \
              mtx[6].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 6)), a.vect_u16); \
              mtx[7].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 7)), a.vect_u16); \
          }
      
      /*
       * Build the 16x16 equality matrix for 8-bit lanes.
       * For k = 0..15, mtx[k].vect_u8 lane i is all-ones when lane k of b equals lane i
       * of a, and zero otherwise. a and b are __kp_m128i values; mtx is an array of at
       * least 16 __kp_m128i. The lane index must be a literal, hence the unrolled form.
       */
      #define KP_PCMPSTR_EQ_8x16(a, b, mtx)                                                       \
          {                                                                                    \
              mtx[0].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 0)), a.vect_u8);   \
              mtx[1].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 1)), a.vect_u8);   \
              mtx[2].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 2)), a.vect_u8);   \
              mtx[3].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 3)), a.vect_u8);   \
              mtx[4].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 4)), a.vect_u8);   \
              mtx[5].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 5)), a.vect_u8);   \
              mtx[6].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 6)), a.vect_u8);   \
              mtx[7].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 7)), a.vect_u8);   \
              mtx[8].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 8)), a.vect_u8);   \
              mtx[9].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 9)), a.vect_u8);   \
              mtx[10].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 10)), a.vect_u8); \
              mtx[11].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 11)), a.vect_u8); \
              mtx[12].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 12)), a.vect_u8); \
              mtx[13].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 13)), a.vect_u8); \
              mtx[14].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 14)), a.vect_u8); \
              mtx[15].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 15)), a.vect_u8); \
          }
      
      /*
       * Build the range-comparison matrix for unsigned 16-bit lanes.
       * For k = 0..7, lane k of b is broadcast and compared with every lane of a.
       * mask has the low 16 bits of every 32-bit element set, so within each 32-bit
       * pair the low 16-bit lane of mtx[k] receives (b[k] >= a[i]) and the high 16-bit
       * lane receives (b[k] <= a[i]), each as an all-ones / zero lane.
       * mtx is an array of at least 8 __kp_m128i.
       */
      #define KP_PCMPSTR_RNG_U16x8(a, b, mtx)                                                                          \
          {                                                                                                         \
              uint16x8_t vect_b[8];                                                                                 \
              __kp_m128i mask;                                                                                         \
              mask.vect_u32 = vdupq_n_u32(0xffff);                                                                  \
              vect_b[0] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 0));                                               \
              vect_b[1] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 1));                                               \
              vect_b[2] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 2));                                               \
              vect_b[3] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 3));                                               \
              vect_b[4] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 4));                                               \
              vect_b[5] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 5));                                               \
              vect_b[6] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 6));                                               \
              vect_b[7] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 7));                                               \
              int i;                                                                                                \
              for (i = 0; i < 8; i++) {                                                                             \
                  mtx[i].vect_u16 = vbslq_u16(mask.vect_u16, vcgeq_u16(vect_b[i], a.vect_u16),                      \
                  vcleq_u16(vect_b[i], a.vect_u16));                                                                \
              }                                                                                                     \
          }
      /*
       * Signed 16-bit variant of the range-comparison matrix.
       * Same layout as the unsigned 16-bit macro, but lanes of a and b are compared as
       * signed values: within each 32-bit pair the low 16-bit lane of mtx[k] receives
       * (b[k] >= a[i]) and the high 16-bit lane receives (b[k] <= a[i]).
       * mtx is an array of at least 8 __kp_m128i.
       */
      #define KP_PCMPSTR_RNG_S16x8(a, b, mtx)                                                                          \
          {                                                                                                         \
              int16x8_t vect_b[8];                                                                                  \
              __kp_m128i mask;                                                                                         \
              mask.vect_u32 = vdupq_n_u32(0xffff);                                                                  \
              vect_b[0] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 0));                                               \
              vect_b[1] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 1));                                               \
              vect_b[2] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 2));                                               \
              vect_b[3] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 3));                                               \
              vect_b[4] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 4));                                               \
              vect_b[5] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 5));                                               \
              vect_b[6] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 6));                                               \
              vect_b[7] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 7));                                               \
              int i;                                                                                                \
              for (i = 0; i < 8; i++) {                                                                             \
                  mtx[i].vect_u16 = vbslq_u16(mask.vect_u16, vcgeq_s16(vect_b[i], a.vect_s16),                      \
                  vcleq_s16(vect_b[i], a.vect_s16));                                                                \
              }                                                                                                     \
          }
      
      /*
       * Build the range-comparison matrix for unsigned 8-bit lanes.
       * For k = 0..15, lane k of b is broadcast and compared with every lane of a.
       * mask has the low 8 bits of every 16-bit element set, so within each 16-bit pair
       * the low byte lane of mtx[k] receives (b[k] >= a[i]) and the high byte lane
       * receives (b[k] <= a[i]), each as an all-ones / zero lane.
       * mtx is an array of at least 16 __kp_m128i.
       */
      #define KP_PCMPSTR_RNG_U8x16(a, b, mtx)                                                                                 \
          {                                                                                                                \
              uint8x16_t vect_b[16];                                                                                       \
              __kp_m128i mask;                                                                                                \
              mask.vect_u16 = vdupq_n_u16(0xff);                                                                           \
              vect_b[0] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 0));                                                         \
              vect_b[1] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 1));                                                         \
              vect_b[2] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 2));                                                         \
              vect_b[3] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 3));                                                         \
              vect_b[4] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 4));                                                         \
              vect_b[5] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 5));                                                         \
              vect_b[6] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 6));                                                         \
              vect_b[7] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 7));                                                         \
              vect_b[8] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 8));                                                         \
              vect_b[9] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 9));                                                         \
              vect_b[10] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 10));                                                       \
              vect_b[11] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 11));                                                       \
              vect_b[12] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 12));                                                       \
              vect_b[13] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 13));                                                       \
              vect_b[14] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 14));                                                       \
              vect_b[15] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 15));                                                       \
              int i;                                                                                                       \
              for (i = 0; i < 16; i++) {                                                                                   \
                  mtx[i].vect_u8 = vbslq_u8(mask.vect_u8, vcgeq_u8(vect_b[i], a.vect_u8), vcleq_u8(vect_b[i], a.vect_u8)); \
              }                                                                                                            \
          }
      
      /*
       * Signed 8-bit variant of the range-comparison matrix.
       * Same layout as the unsigned 8-bit macro, but lanes of a and b are compared as
       * signed values: within each 16-bit pair the low byte lane of mtx[k] receives
       * (b[k] >= a[i]) and the high byte lane receives (b[k] <= a[i]).
       * mtx is an array of at least 16 __kp_m128i.
       */
      #define KP_PCMPSTR_RNG_S8x16(a, b, mtx)                                                                                 \
          {                                                                                                                \
              int8x16_t vect_b[16];                                                                                        \
              __kp_m128i mask;                                                                                                \
              mask.vect_u16 = vdupq_n_u16(0xff);                                                                           \
              vect_b[0] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 0));                                                         \
              vect_b[1] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 1));                                                         \
              vect_b[2] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 2));                                                         \
              vect_b[3] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 3));                                                         \
              vect_b[4] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 4));                                                         \
              vect_b[5] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 5));                                                         \
              vect_b[6] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 6));                                                         \
              vect_b[7] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 7));                                                         \
              vect_b[8] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 8));                                                         \
              vect_b[9] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 9));                                                         \
              vect_b[10] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 10));                                                       \
              vect_b[11] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 11));                                                       \
              vect_b[12] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 12));                                                       \
              vect_b[13] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 13));                                                       \
              vect_b[14] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 14));                                                       \
              vect_b[15] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 15));                                                       \
              int i;                                                                                                       \
              for (i = 0; i < 16; i++) {                                                                                   \
                  mtx[i].vect_u8 = vbslq_u8(mask.vect_u8, vcgeq_s8(vect_b[i], a.vect_s8), vcleq_s8(vect_b[i], a.vect_s8)); \
              }                                                                                                            \
          }
      
      /*
       * Reduce an 8-bit comparison matrix to an "equal any" result bitmask.
       *
       * Only lanes with index < la of each row are considered. Bit j of the return
       * value is set when row j (for j < lb) has at least one considered lane set.
       * Rows 0..lb-1 of mtx are overwritten with the masked 0/1 lane values.
       */
      static int kp_aggregate_equal_any_8x16(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          const uint8x8_t bit_sel = vld1_u8(g_kp_mask_epi8);
          /* Expand the 16-bit validity mask into one all-ones / zero byte per lane. */
          const uint8x16_t lane_ok = vcombine_u8(vtst_u8(vdup_n_u8(valid_a & 0xff), bit_sel),
                                                 vtst_u8(vdup_n_u8(valid_a >> 8), bit_sel));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              /* Keep valid lanes only and turn 0xff into 1 so the lanes can be summed. */
              mtx[row].vect_u8 = vshrq_n_u8(vandq_u8(lane_ok, mtx[row].vect_u8), 7);
              if (vaddvq_u8(mtx[row].vect_u8)) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * Reduce a 16-bit comparison matrix to an "equal any" result bitmask.
       *
       * Only lanes with index < la of each row are considered. Bit j of the return
       * value is set when row j (for j < lb) has at least one considered lane set.
       * Rows 0..lb-1 of mtx are overwritten with the masked 0/1 lane values.
       */
      static int kp_aggregate_equal_any_16x8(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          /* One all-ones / zero 16-bit lane per bit of the validity mask. */
          const uint16x8_t lane_ok = vtstq_u16(vdupq_n_u16(valid_a), vld1q_u16(g_kp_mask_epi16));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              /* Keep valid lanes only and turn 0xffff into 1 so the lanes can be summed. */
              mtx[row].vect_u16 = vshrq_n_u16(vandq_u16(lane_ok, mtx[row].vect_u16), 15);
              if (vaddvq_u16(mtx[row].vect_u16)) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * "Equal any" on 8-bit lanes: build the 16x16 equality matrix of a against b and
       * aggregate it with kp_aggregate_equal_any_8x16 using la and lb.
       */
      static int kp_cal_res_byte_equal_any(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i eq_rows[16];

          KP_PCMPSTR_EQ_8x16(a, b, eq_rows);
          return kp_aggregate_equal_any_8x16(la, lb, eq_rows);
      }
      
      /*
       * "Equal any" on 16-bit lanes: build the 8x8 equality matrix of a against b and
       * aggregate it with kp_aggregate_equal_any_16x8 using la and lb.
       * Only the first 8 entries of the 16-entry scratch array are filled.
       */
      static int kp_cal_res_word_equal_any(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i eq_rows[16];

          KP_PCMPSTR_EQ_16x8(a, b, eq_rows);
          return kp_aggregate_equal_any_16x8(la, lb, eq_rows);
      }
      
      /*
       * Reduce a 16-bit range-comparison matrix to a result bitmask.
       *
       * Lanes are treated as pairs sharing one 32-bit element. Bit j of the return
       * value is set when, in row j (for j < lb), some pair has both of its lanes set
       * after lanes with index >= la have been cleared.
       * Rows 0..lb-1 of mtx are overwritten with the masked 0/1 lane values.
       */
      static int kp_aggregate_ranges_16x8(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          const uint16x8_t lane_ok = vtstq_u16(vdupq_n_u16(valid_a), vld1q_u16(g_kp_mask_epi16));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              __kp_m128i upper;

              mtx[row].vect_u16 = vshrq_n_u16(vandq_u16(lane_ok, mtx[row].vect_u16), 15);
              /* Bring the upper lane of each pair down next to the lower one and AND them. */
              upper.vect_u32 = vshrq_n_u32(mtx[row].vect_u32, 16);
              if (vaddvq_u32(vandq_u32(mtx[row].vect_u32, upper.vect_u32))) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * Reduce an 8-bit range-comparison matrix to a result bitmask.
       *
       * Lanes are treated as pairs sharing one 16-bit element. Bit j of the return
       * value is set when, in row j (for j < lb), some pair has both of its lanes set
       * after lanes with index >= la have been cleared.
       * Rows 0..lb-1 of mtx are overwritten with the masked 0/1 lane values.
       */
      static int kp_aggregate_ranges_8x16(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          const uint8x8_t bit_sel = vld1_u8(g_kp_mask_epi8);
          /* Expand the 16-bit validity mask into one all-ones / zero byte per lane. */
          const uint8x16_t lane_ok = vcombine_u8(vtst_u8(vdup_n_u8(valid_a & 0xff), bit_sel),
                                                 vtst_u8(vdup_n_u8(valid_a >> 8), bit_sel));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              __kp_m128i upper;

              mtx[row].vect_u8 = vshrq_n_u8(vandq_u8(lane_ok, mtx[row].vect_u8), 7);
              /* Bring the upper byte of each pair down next to the lower one and AND them. */
              upper.vect_u16 = vshrq_n_u16(mtx[row].vect_u16, 8);
              if (vaddvq_u16(vandq_u16(mtx[row].vect_u16, upper.vect_u16))) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * Range comparison on unsigned 8-bit lanes: build the comparison matrix of a
       * against b and aggregate it with kp_aggregate_ranges_8x16 using la and lb.
       */
      static int kp_cal_res_ubyte_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i rng_rows[16];

          KP_PCMPSTR_RNG_U8x16(a, b, rng_rows);
          return kp_aggregate_ranges_8x16(la, lb, rng_rows);
      }
      
      /*
       * Range comparison on signed 8-bit lanes: build the comparison matrix of a
       * against b and aggregate it with kp_aggregate_ranges_8x16 using la and lb.
       */
      static int kp_cal_res_sbyte_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i rng_rows[16];

          KP_PCMPSTR_RNG_S8x16(a, b, rng_rows);
          return kp_aggregate_ranges_8x16(la, lb, rng_rows);
      }
      
      /*
       * Range comparison on unsigned 16-bit lanes: build the comparison matrix of a
       * against b and aggregate it with kp_aggregate_ranges_16x8 using la and lb.
       * Only the first 8 entries of the 16-entry scratch array are filled.
       */
      static int kp_cal_res_uword_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i rng_rows[16];

          KP_PCMPSTR_RNG_U16x8(a, b, rng_rows);
          return kp_aggregate_ranges_16x8(la, lb, rng_rows);
      }
      
      /*
       * Range comparison on signed 16-bit lanes: build the comparison matrix of a
       * against b and aggregate it with kp_aggregate_ranges_16x8 using la and lb.
       * Only the first 8 entries of the 16-entry scratch array are filled.
       */
      static int kp_cal_res_sword_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i rng_rows[16];

          KP_PCMPSTR_RNG_S16x8(a, b, rng_rows);
          return kp_aggregate_ranges_16x8(la, lb, rng_rows);
      }
      
      /*
       * "Equal each" on 8-bit lanes: lane-wise equality of a and b packed into a
       * 16-bit result, one bit per lane.
       *
       * Adjustments applied on top of the raw equality, in this order:
       *   1. when la >= lb, lanes lb..la-1 are forced to 0;
       *   2. lanes with index >= la take the value "index >= lb" instead.
       * The low and high eight lanes are processed separately and recombined.
       */
      static int kp_cal_res_byte_equal_each(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          const uint8x16_t equal = vceqq_u8(a.vect_u8, b.vect_u8);
          const uint8x8_t bit_sel = vld1_u8(g_kp_mask_epi8);
          const uint8x8_t zero = vdup_n_u8(0);

          int clear_bits = 0;
          if (!(la < lb)) {
              clear_bits = (1 << la) - (1 << lb);
          }
          const int past_a = 0x10000 - (1 << la);
          const int past_b = 0x10000 - (1 << lb);

          /* Lanes 0..7 use the low byte of each bitmask. */
          uint8x8_t lo = vbsl_u8(vtst_u8(vdup_n_u8(clear_bits), bit_sel), zero, vget_low_u8(equal));
          lo = vbsl_u8(vtst_u8(vdup_n_u8(past_a), bit_sel), vtst_u8(vdup_n_u8(past_b), bit_sel), lo);
          lo = vand_u8(lo, bit_sel);

          /* Lanes 8..15 use the high byte of each bitmask. */
          uint8x8_t hi = vbsl_u8(vtst_u8(vdup_n_u8(clear_bits >> 8), bit_sel), zero, vget_high_u8(equal));
          hi = vbsl_u8(vtst_u8(vdup_n_u8(past_a >> 8), bit_sel), vtst_u8(vdup_n_u8(past_b >> 8), bit_sel), hi);
          hi = vand_u8(hi, bit_sel);

          /* Each lane now holds either 0 or its own bit; summing packs them into a byte. */
          return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
      }
      
      /*
       * "Equal each" on 16-bit lanes: lane-wise equality of a and b packed into an
       * 8-bit result, one bit per lane.
       *
       * Adjustments applied on top of the raw equality, in this order:
       *   1. when la >= lb, lanes lb..la-1 are forced to 0;
       *   2. lanes with index >= la take the value "index >= lb" instead.
       */
      static int kp_cal_res_word_equal_each(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          const uint16x8_t bit_sel = vld1q_u16(g_kp_mask_epi16);

          int clear_bits = 0;
          if (!(la < lb)) {
              clear_bits = (1 << la) - (1 << lb);
          }
          const int past_a = 0x100 - (1 << la);
          const int past_b = 0x100 - (1 << lb);

          const uint16x8_t clear_lanes = vtstq_u16(vdupq_n_u16(clear_bits), bit_sel);
          const uint16x8_t beyond_a = vtstq_u16(vdupq_n_u16(past_a), bit_sel);
          const uint16x8_t beyond_b = vtstq_u16(vdupq_n_u16(past_b), bit_sel);

          uint16x8_t lanes = vceqq_u16(a.vect_u16, b.vect_u16);
          lanes = vbslq_u16(clear_lanes, vdupq_n_u16(0), lanes);
          lanes = vbslq_u16(beyond_a, beyond_b, lanes);
          /* Each lane now holds either 0 or its own bit; summing packs them together. */
          lanes = vandq_u16(lanes, bit_sel);
          return vaddvq_u16(lanes);
      }
      
      /*
       * Reduce an 8-bit equality matrix to an "equal ordered" result bitmask.
       *
       * bound is the number of rows / lanes processed (the visible caller passes 16).
       * Lanes with index >= la are forced to all-ones in every row; rows with index
       * >= lb are otherwise zero. Bit j of the return value is set when the first
       * (bound - j) lanes of row j are all set. Rows of mtx are overwritten.
       *
       * Fix: the "index >= la" lane mask is a 16-bit quantity. It is now expanded from
       * its low byte for lanes 0..7 and from its high byte for lanes 8..15, matching
       * the other 8-bit helpers in this header. Previously the value was truncated to
       * its low byte and reused for both halves, so lanes 8..15 got the wrong mask.
       */
      static int kp_aggregate_equal_ordered_8x16(int bound, int la, int lb, __kp_m128i mtx[16])
      {
          int res = 0;
          int j, k;
          int m1 = 0x10000 - (1 << la);
          uint8x8_t vect_mask = vld1_u8(g_kp_mask_epi8);
          uint8x16_t vect1 = vcombine_u8(vtst_u8(vdup_n_u8(m1 & 0xff), vect_mask),
                                         vtst_u8(vdup_n_u8((m1 >> 8) & 0xff), vect_mask));
          uint8x16_t vect_minusone = vdupq_n_u8(0xff);
          uint8x16_t vect_zero = vdupq_n_u8(0);
          for (j = 0; j < lb; j++) {
              mtx[j].vect_u8 = vbslq_u8(vect1, vect_minusone, mtx[j].vect_u8);
          }
          for (j = lb; j < bound; j++) {
              mtx[j].vect_u8 = vbslq_u8(vect1, vect_minusone, vect_zero);
          }
          /* enable[] holds 1 for lanes still taking part; one lane is dropped from the top per row. */
          uint8_t enable[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
          for (j = 0; j < bound; j++) {
              int val = 1;
              uint8x16_t vect_en = vld1q_u8(enable);
              /* NOTE(review): k is never used inside this loop, so every iteration repeats the
               * same test on row j. Confirm whether a walk over rows j..bound-1 was intended. */
              for (k = j; k < bound && val == 1; k++) {
                  int t = vaddvq_u8(vandq_u8(mtx[j].vect_u8, vect_en));
                  val = (t == bound - j) ? 1 : 0;
              }
              res = (val << j) + res;
              enable[bound - 1 - j] = 0;
          }
          return res;
      }
      
      /*
       * Reduce a 16-bit equality matrix to an "equal ordered" result bitmask.
       *
       * bound is the number of rows / lanes processed (the visible caller passes 8).
       * Lanes with index >= la are forced to all-ones in every row; rows with index
       * >= lb are otherwise zero. Bit j of the return value is set when the first
       * (bound - j) lanes of row j are all set. Rows of mtx are overwritten.
       */
      static int kp_aggregate_equal_ordered_16x8(int bound, int la, int lb, __kp_m128i mtx[16])
      {
          const int past_a = 0x100 - (1 << la);
          const uint16x8_t beyond_a = vtstq_u16(vdupq_n_u16(past_a), vld1q_u16(g_kp_mask_epi16));
          const uint16x8_t all_ones = vdupq_n_u16(0xffff);
          const uint16x8_t all_zero = vdupq_n_u16(0);
          /* 1 for lanes still taking part; one lane is dropped from the top per row. */
          uint16_t lane_on[8] = {1, 1, 1, 1, 1, 1, 1, 1};
          int result = 0;
          int row;

          for (row = 0; row < lb; row++) {
              mtx[row].vect_u16 = vbslq_u16(beyond_a, all_ones, mtx[row].vect_u16);
          }
          for (row = lb; row < bound; row++) {
              mtx[row].vect_u16 = vbslq_u16(beyond_a, all_ones, all_zero);
          }

          for (row = 0; row < bound; row++) {
              const uint16x8_t active = vld1q_u16(lane_on);
              int hit = 1;
              int k = row;

              /* The test does not depend on k; it is repeated while it keeps succeeding. */
              while (k < bound && hit == 1) {
                  const int set_lanes = vaddvq_u16(vandq_u16(mtx[row].vect_u16, active));
                  hit = (set_lanes == bound - row) ? 1 : 0;
                  k++;
              }
              result += hit << row;
              lane_on[bound - 1 - row] = 0;
          }
          return result;
      }
      
      /*
       * "Equal ordered" on 8-bit lanes: build the 16x16 equality matrix of a against b
       * and aggregate it with kp_aggregate_equal_ordered_8x16 using a bound of 16.
       */
      static int kp_cal_res_byte_equal_ordered(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i eq_rows[16];

          KP_PCMPSTR_EQ_8x16(a, b, eq_rows);
          return kp_aggregate_equal_ordered_8x16(16, la, lb, eq_rows);
      }
      
      /*
       * "Equal ordered" on 16-bit lanes: build the 8x8 equality matrix of a against b
       * and aggregate it with kp_aggregate_equal_ordered_16x8 using a bound of 8.
       * Only the first 8 entries of the 16-entry scratch array are filled.
       */
      static int kp_cal_res_word_equal_ordered(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i eq_rows[16];

          KP_PCMPSTR_EQ_16x8(a, b, eq_rows);
          return kp_aggregate_equal_ordered_16x8(8, la, lb, eq_rows);
      }
      
      /*
       * Comparison modes, numbered 0..15 in declaration order. Each group of four
       * covers one aggregation (EQUAL_ANY, RANGES, EQUAL_EACH, EQUAL_ORDERED) and,
       * within a group, the element type order is UBYTE, UWORD, SBYTE, SWORD.
       * The dispatch table g_kp_cmpestrFuncList lists its entries in this same order.
       */
      typedef enum {
          KP_CMP_UBYTE_EQUAL_ANY,
          KP_CMP_UWORD_EQUAL_ANY,
          KP_CMP_SBYTE_EQUAL_ANY,
          KP_CMP_SWORD_EQUAL_ANY,
          KP_CMP_UBYTE_RANGES,
          KP_CMP_UWORD_RANGES,
          KP_CMP_SBYTE_RANGES,
          KP_CMP_SWORD_RANGES,
          KP_CMP_UBYTE_EQUAL_EACH,
          KP_CMP_UWORD_EQUAL_EACH,
          KP_CMP_SBYTE_EQUAL_EACH,
          KP_CMP_SWORD_EQUAL_EACH,
          KP_CMP_UBYTE_EQUAL_ORDERED,
          KP_CMP_UWORD_EQUAL_ORDERED,
          KP_CMP_SBYTE_EQUAL_ORDERED,
          KP_CMP_SWORD_EQUAL_ORDERED
      } _KP_MM_CMPESTR_ENUM;
      /* Signature shared by all comparison kernels: two vectors with their la / lb values, returning an int bitmask. */
      typedef int (*KP_CMPESTR)(__kp_m128i a, int la, __kp_m128i b, int lb);
      /* One dispatch-table entry: a comparison mode paired with the kernel that implements it. */
      typedef struct {
          _KP_MM_CMPESTR_ENUM cmpintEnum;
          KP_CMPESTR cmpFun;
      } KP_CmpestrFuncList;
      /*
       * Dispatch table pairing every _KP_MM_CMPESTR_ENUM value with its kernel.
       * Entries appear in enum declaration order, so entry i carries enum value i.
       * The signed and unsigned EQUAL_ANY / EQUAL_EACH / EQUAL_ORDERED variants
       * share one kernel per element width; only the RANGES variants have
       * separate signed and unsigned kernels.
       */
      static KP_CmpestrFuncList g_kp_cmpestrFuncList[] = {{KP_CMP_UBYTE_EQUAL_ANY, kp_cal_res_byte_equal_any},
                                                    {KP_CMP_UWORD_EQUAL_ANY, kp_cal_res_word_equal_any},
                                                    {KP_CMP_SBYTE_EQUAL_ANY, kp_cal_res_byte_equal_any},
                                                    {KP_CMP_SWORD_EQUAL_ANY, kp_cal_res_word_equal_any},
                                                    {KP_CMP_UBYTE_RANGES, kp_cal_res_ubyte_ranges},
                                                    {KP_CMP_UWORD_RANGES, kp_cal_res_uword_ranges},
                                                    {KP_CMP_SBYTE_RANGES, kp_cal_res_sbyte_ranges},
                                                    {KP_CMP_SWORD_RANGES, kp_cal_res_sword_ranges},
                                                    {KP_CMP_UBYTE_EQUAL_EACH, kp_cal_res_byte_equal_each},
                                                    {KP_CMP_UWORD_EQUAL_EACH, kp_cal_res_word_equal_each},
                                                    {KP_CMP_SBYTE_EQUAL_EACH, kp_cal_res_byte_equal_each},
                                                    {KP_CMP_SWORD_EQUAL_EACH, kp_cal_res_word_equal_each},
                                                    {KP_CMP_UBYTE_EQUAL_ORDERED, kp_cal_res_byte_equal_ordered},
                                                    {KP_CMP_UWORD_EQUAL_ORDERED, kp_cal_res_word_equal_ordered},
                                                    {KP_CMP_SBYTE_EQUAL_ORDERED, kp_cal_res_byte_equal_ordered},
                                                    {KP_CMP_SWORD_EQUAL_ORDERED, kp_cal_res_word_equal_ordered}};
      
      /*
       * Applies the polarity selected by bits 4-5 of imm8 to an intermediate
       * result mask.
       *   res   - intermediate result mask
       *   lb    - number of low bits to invert for the masked-negative polarity
       *   imm8  - control byte; only (imm8 & 0x30) is examined
       *   bound - element count; 8 keeps the low 8 result bits, any other value
       *           keeps the low 16
       * _KP_SIDD_NEGATIVE_POLARITY inverts every bit, while
       * _KP_SIDD_MASKED_NEGATIVE_POLARITY inverts only the low lb bits; any other
       * polarity value leaves res unchanged. The result is then truncated to the
       * width implied by bound.
       */
      KP_FORCE_INLINE int kp_neg_fun(int res, int lb, int imm8, int bound)
      {
          const int polarity = imm8 & 0x30;
          const int width_mask = (bound == 8) ? 0xFF : 0xFFFF;

          if (polarity == _KP_SIDD_NEGATIVE_POLARITY) {
              res ^= 0xffffffff;
          } else if (polarity == _KP_SIDD_MASKED_NEGATIVE_POLARITY) {
              res ^= (1 << lb) - 1;
          }

          return res & width_mask;
      }
      
      /*
       * Declaration only; the definition is not part of this header excerpt.
       * NOTE(review): presumably the counterpart of x86 _mm_cmpestri (returns an
       * index); the role of the intRes2 out-parameter is not visible here -
       * confirm against the definition.
       */
      int __remill_simd__mm_cmpestri(__kp_m128i a, int la, __kp_m128i b, int lb, const int imm8, int *intRes2);
      
      
      /*
       * Declaration only; the definition is not part of this header excerpt.
       * NOTE(review): presumably the counterpart of x86 _mm_cmpestrm (returns a
       * mask vector) - confirm against the definition.
       */
      __kp_m128i __remill_simd__mm_cmpestrm(__kp_m128i a, int la, __kp_m128i b, int lb, const int imm8, int *intRes2);
      
      
      #endif // KUNPENG_TRANS_H
    3. 按“Esc”键,输入:wq!,按“Enter”保存并退出编辑。
  12. 执行以下命令创建“KunpengTrans.cpp”文件。
    1. 创建“KunpengTrans.cpp”文件。

      vi KunpengTrans.cpp

    2. 按“i”进入编辑模式,添加如下内容。
      /*
       * @Description: KunpengTrans.cpp
       * @Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
       */
      #ifndef KUNPENG_TRANS_H
      #define KUNPENG_TRANS_H
      
      #include <string.h>
      #include <KunpengTrans.h>
      
      /* ATTENTION:
       * Please set KUNPENG_CPU_FREQUENCY_MHZ to the actual CPU frequency (in MHz)
       * of your running environment.
       */
      const int KUNPENG_CPU_FREQUENCY_MHZ = 2600;
      /* Leaf number that GetSupportedCPUID routes to GetExtendCPUFeature. */
      const int EAX_LEAF = 7;
      /* Shift amount used by RepStos/RepMovs to turn a width in bits into bytes
       * (>> 3) and a byte index into a bit offset (<< 3). */
      const int SHIFT_THREE = 3;
      /* CASE eax = 0; Highest Function Parameter and Manufacturer ID
       *
       * Stores the 12-character manufacturer ID "KunpengHisil" into the three
       * output registers, four characters each, in the order ebx, edx, ecx.
       * The bytes are copied with memcpy so that the character data is never
       * read through an unsigned int lvalue (which would break strict aliasing
       * and could be misaligned).
       *   ebx - receives "Kunp"
       *   edx - receives "engH"
       *   ecx - receives "isil"
       */
      void GetCPUManuID(unsigned int *ebx, unsigned int *edx, unsigned int *ecx)
      {
          // ID str = "KunpengHisil"
          const char b_str[] = "Kunp";
          const char d_str[] = "engH";
          const char c_str[] = "isil";
          memcpy(ebx, b_str, sizeof(*ebx));
          memcpy(edx, d_str, sizeof(*edx));
          memcpy(ecx, c_str, sizeof(*ecx));
      }
      
      // Feature-bit masks for the %ecx value reported by GetCPUFeature (eax = 1).
      // NOTE(review): GCC's <cpuid.h> lists bit_LZCNT under leaf 0x80000001 %ecx;
      // in leaf 1 %ecx bit 5 denotes VMX - confirm that reporting it from
      // GetCPUFeature is intended.
      #define bit_SSE3 (1 << 0)
      #define bit_PCLMUL (1 << 1)
      #define bit_LZCNT (1 << 5)
      #define bit_SSSE3 (1 << 9)
      #define bit_FMA (1 << 12)
      #define bit_CMPXCHG16B (1 << 13)
      #define bit_SSE4_1 (1 << 19)
      #define bit_SSE4_2 (1 << 20)
      #define bit_MOVBE (1 << 22)
      #define bit_POPCNT (1 << 23)
      #define bit_AES (1 << 25)
      #define bit_XSAVE (1 << 26)
      #define bit_OSXSAVE (1 << 27)
      #define bit_AVX (1 << 28)
      #define bit_F16C (1 << 29)
      #define bit_RDRND (1 << 30)
      
      // Feature-bit masks for the %edx value reported by GetCPUFeature (eax = 1).
      #define bit_CMPXCHG8B (1 << 8)
      #define bit_CMOV (1 << 15)
      #define bit_MMX (1 << 23)
      #define bit_FXSAVE (1 << 24)
      #define bit_SSE (1 << 25)
      #define bit_SSE2 (1 << 26)
      /* CASE eax = 1; Processor Info and Feature Bits
       * Reference values from x86 parts:
       * Skylake:       eax = 0x00050654, ebx = 0x43400800, ecx = 0x7ffefbf7, edx = 0xbfebfbff (sampled on a 6148)
       * Cascade Lake : eax = 0x00050657, ebx = 0x08400800, ecx = 0x7ffefbf7, edx = 0xbfebfbff (sampled on a 6248)
       *
       * Reports the emulated feature set: eax and ebx are cleared, ecx and edx
       * receive the OR of the feature bits listed below. Outputs are written in
       * the order eax, ebx, ecx, edx.
       */
      void GetCPUFeature(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
      {
          const unsigned int ecx_features =
              bit_SSE3 | bit_LZCNT | bit_SSSE3 | bit_SSE4_1 | bit_SSE4_2 | bit_POPCNT;
          const unsigned int edx_features = bit_MMX | bit_SSE | bit_SSE2;

          *eax = 0x0; // Processor Info is not defined on Kunpeng
          *ebx = 0x0; // Additional Info is not defined on Kunpeng
          *ecx = ecx_features;
          *edx = edx_features;
      }
      
      /* Feature-bit masks for %ebx.
       * NOTE(review): presumably the extended-features leaf (eax = 7, ecx = 0);
       * none of these masks is referenced by GetExtendCPUFeature, which reports
       * all zeros. */
      #define bit_FSGSBASE (1 << 0)
      #define bit_SGX (1 << 2)
      #define bit_BMI (1 << 3)
      #define bit_HLE (1 << 4)
      #define bit_AVX2 (1 << 5)
      #define bit_BMI2 (1 << 8)
      #define bit_RTM (1 << 11)
      #define bit_MPX (1 << 14)
      #define bit_AVX512F (1 << 16)
      #define bit_AVX512DQ (1 << 17)
      #define bit_RDSEED (1 << 18)
      #define bit_ADX (1 << 19)
      #define bit_AVX512IFMA (1 << 21)
      #define bit_CLFLUSHOPT (1 << 23)
      #define bit_CLWB (1 << 24)
      #define bit_AVX512PF (1 << 26)
      #define bit_AVX512ER (1 << 27)
      #define bit_AVX512CD (1 << 28)
      #define bit_SHA (1 << 29)
      #define bit_AVX512BW (1 << 30)
      #define bit_AVX512VL (1u << 31) /* unsigned literal: 1 << 31 would overflow int */
      
      /* Feature-bit masks for %ecx (same leaf as above; note that bit 11 is
       * listed before bit 10). */
      #define bit_PREFETCHWT1 (1 << 0)
      #define bit_AVX512VBMI (1 << 1)
      #define bit_PKU (1 << 3)
      #define bit_OSPKE (1 << 4)
      #define bit_AVX512VBMI2 (1 << 6)
      #define bit_SHSTK (1 << 7)
      #define bit_GFNI (1 << 8)
      #define bit_VAES (1 << 9)
      #define bit_AVX512VNNI (1 << 11)
      #define bit_VPCLMULQDQ (1 << 10)
      #define bit_AVX512BITALG (1 << 12)
      #define bit_AVX512VPOPCNTDQ (1 << 14)
      #define bit_RDPID (1 << 22)
      #define bit_MOVDIRI (1 << 27)
      #define bit_MOVDIR64B (1 << 28)
      
      /* Feature-bit masks for %edx (same leaf as above). */
      #define bit_AVX5124VNNIW (1 << 2)
      #define bit_AVX5124FMAPS (1 << 3)
      #define bit_IBT (1 << 20)
      #define bit_PCONFIG (1 << 18)
      /* CASE eax = 7, ecx = 0; Extended Features
       * Reference values from x86 parts:
       * Skylake:       eax = 0x00000000, ebx = 0xd39ffffb, ecx = 0x00000018, edx = 0x9c002400 (sampled on a 6148)
       * Cascade Lake : eax = 0x00000000, ebx = 0xd39ffffb, ecx = 0x00000818, edx = 0xbc000400 (sampled on a 6248)
       *
       * No extended feature is reported: all four outputs are set to zero, in
       * the order eax, ebx, ecx, edx.
       */
      void GetExtendCPUFeature(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
      {
          unsigned int *const regs[] = { eax, ebx, ecx, edx };
          unsigned idx;

          for (idx = 0; idx < sizeof(regs) / sizeof(regs[0]); idx++) {
              *regs[idx] = 0x0;
          }
      }
      
      /*
       * CPUID emulation entry point. On entry *eax holds the requested leaf and
       * *ecx the sub-leaf; on return the four registers hold the answer.
       * Supported queries:
       *   eax = 0          -> *eax = 7 and the manufacturer ID in ebx/edx/ecx
       *   eax = 1          -> GetCPUFeature
       *   eax = 7, ecx = 0 -> GetExtendCPUFeature
       * Every other query returns 0x0 in all four registers.
       */
      void GetSupportedCPUID(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
      {
          const unsigned int leaf = *eax;
          const unsigned int subleaf = *ecx;

          if (leaf == 0) {
              *eax = 0x7;
              GetCPUManuID(ebx, edx, ecx);
              return;
          }
          if (leaf == 1) {
              GetCPUFeature(eax, ebx, ecx, edx);
              return;
          }
          if (leaf == EAX_LEAF && subleaf == 0) {
              GetExtendCPUFeature(eax, ebx, ecx, edx);
              return;
          }

          /* Unsupported leaf or sub-leaf: report nothing. */
          *eax = 0x0;
          *ebx = 0x0;
          *ecx = 0x0;
          *edx = 0x0;
      }
      
      /*
       * Generic "rep stos" emulation: stores len elements of width bits each,
       * taken from the low bytes of src (least significant byte first).
       *   dest  - address of the first element to store
       *   src   - value to store; only the low width bits are used
       *   len   - number of elements
       *   width - element width in bits (expected to be at most 64, since src
       *           is shifted by up to width - 8 bits)
       *   df    - 1 steps the destination downwards after each element, any
       *           other value steps it upwards
       * The element counter has the same type as len, so counts above UINT_MAX
       * terminate correctly instead of wrapping the counter.
       */
      void RepStos(void *dest, unsigned long long src, unsigned long long len, unsigned width, int df)
      {
          unsigned n = width >> SHIFT_THREE; /* bytes per element */
          unsigned char *d = (unsigned char *)dest;
          unsigned long long i;
          unsigned j;

          for (i = 0; i < len; i++) {
              for (j = 0; j < n; j++) {
                  d[j] = (unsigned char)(src >> (j << SHIFT_THREE));
              }
              if (df == 1) {
                  d -= n;
              } else {
                  d += n;
              }
          }
      }
      
      /*
       * "rep stosb" emulation: stores the low byte of src into len bytes.
       *   dest - address of the first byte to store
       *   src  - value; only the low 8 bits are used
       *   len  - number of bytes
       *   DF   - zero: fill dest[0 .. len-1];
       *          non-zero: fill downwards from dest, i.e. dest[-(len-1) .. 0]
       * The downward case starts at dest itself (as RepStos and the x86 STOS
       * instruction do) rather than one element below it.
       */
      void RepStosB(void *dest, unsigned long long src, unsigned long long len, int DF)
      {
          unsigned char *s = (unsigned char *)dest;
          const unsigned char value = (unsigned char)src;
          unsigned long long i;

          if (len == 0) {
              return;
          }
          if (DF) {
              s -= (len - 1); /* lowest address touched when stepping downwards */
          }
          for (i = 0; i < len; i++) {
              s[i] = value;
          }
      }
      
      /*
       * "rep stosw" emulation: stores the low 16 bits of src into len
       * unsigned short elements.
       *   dest - address of the first element to store
       *   src  - value; truncated to unsigned short
       *   len  - number of elements
       *   DF   - zero: fill elements [0 .. len-1] from dest;
       *          non-zero: fill downwards from dest, i.e. elements [-(len-1) .. 0]
       * The downward case starts at dest itself (as RepStos and the x86 STOS
       * instruction do) rather than one element below it.
       */
      void RepStosW(void *dest, unsigned long long src, unsigned long long len, int DF)
      {
          unsigned short *s = (unsigned short *)dest;
          const unsigned short value = (unsigned short)src;
          unsigned long long i;

          if (len == 0) {
              return;
          }
          if (DF) {
              s -= (len - 1); /* lowest address touched when stepping downwards */
          }
          for (i = 0; i < len; i++) {
              s[i] = value;
          }
      }
      
      /*
       * "rep stosd" emulation: stores the low bits of src into len unsigned int
       * elements.
       *   dest - address of the first element to store
       *   src  - value; truncated to unsigned int
       *   len  - number of elements
       *   DF   - zero: fill elements [0 .. len-1] from dest;
       *          non-zero: fill downwards from dest, i.e. elements [-(len-1) .. 0]
       * The downward case starts at dest itself (as RepStos and the x86 STOS
       * instruction do) rather than one element below it.
       */
      void RepStosD(void *dest, unsigned long long src, unsigned long long len, int DF)
      {
          unsigned int *s = (unsigned int *)dest;
          const unsigned int value = (unsigned int)src;
          unsigned long long i;

          if (len == 0) {
              return;
          }
          if (DF) {
              s -= (len - 1); /* lowest address touched when stepping downwards */
          }
          for (i = 0; i < len; i++) {
              s[i] = value;
          }
      }
      
      /*
       * "rep stosq" emulation: stores src into len unsigned long long elements.
       *   dest - address of the first element to store
       *   src  - value to store
       *   len  - number of elements
       *   DF   - zero: fill elements [0 .. len-1] from dest;
       *          non-zero: fill downwards from dest, i.e. elements [-(len-1) .. 0]
       * The downward case starts at dest itself (as RepStos and the x86 STOS
       * instruction do) rather than one element below it.
       */
      void RepStosQ(void *dest, unsigned long long src, unsigned long long len, int DF)
      {
          unsigned long long *s = (unsigned long long *)dest;
          unsigned long long i;

          if (len == 0) {
              return;
          }
          if (DF) {
              s -= (len - 1); /* lowest address touched when stepping downwards */
          }
          for (i = 0; i < len; i++) {
              s[i] = src;
          }
      }
      
      /*
       * "rep movs" emulation: copies len elements of width bits each from src
       * to dest.
       *   dest, src - addresses of the first element to copy
       *   len       - number of elements
       *   width     - element width in bits
       *   df        - 1: elements are walked downwards from dest/src, so the
       *               copied region starts (len - 1) elements below them;
       *               any other value: the region starts at dest/src
       * The byte count is computed in size_t so large copies are not truncated
       * to 32 bits, and an empty copy returns without touching either pointer.
       * The regions must not overlap, because the copy is done by memcpy.
       */
      void RepMovs(void *dest, void *src, unsigned long long len, unsigned width, int df)
      {
          const size_t elem = width >> SHIFT_THREE; /* bytes per element */
          const size_t n = (size_t)len * elem;      /* total bytes to copy */
          char *d = (char *)dest;
          char *s = (char *)src;

          if (n == 0) {
              return;
          }
          if (df == 1) {
              /* The first element occupies [ptr, ptr + elem); the remaining
               * len - 1 elements lie below it. */
              const size_t below = n - elem;
              d -= below;
              s -= below;
          }
          memcpy(d, s, n); // memcpy is not bounds-checked; switch to a checked variant if required
      }
      
      #include <arm_neon.h>
      
      /* Storage-class prefix for small helpers: internal linkage plus the
       * GCC/Clang always_inline attribute. */
      #define KP_FORCE_INLINE static inline __attribute__((always_inline))
      
      /*
       * 128-bit vector value that can be viewed as any of the NEON integer
       * vector types. All members alias the same storage, so a value written
       * through one view can be read back through another.
       */
      typedef union {
          int8x16_t vect_s8;   /* 16 signed 8-bit lanes */
          int16x8_t vect_s16;  /* 8 signed 16-bit lanes */
          int32x4_t vect_s32;  /* 4 signed 32-bit lanes */
          int64x2_t vect_s64;  /* 2 signed 64-bit lanes */
          uint8x16_t vect_u8;  /* 16 unsigned 8-bit lanes */
          uint16x8_t vect_u16; /* 8 unsigned 16-bit lanes */
          uint32x4_t vect_u32; /* 4 unsigned 32-bit lanes */
          uint64x2_t vect_u64; /* 2 unsigned 64-bit lanes */
      } __kp_m128i;
      
      /* Polarity selectors; kp_neg_fun compares them against (imm8 & 0x30). */
      #define _KP_SIDD_NEGATIVE_POLARITY 0x10        // negate results
      #define _KP_SIDD_MASKED_NEGATIVE_POLARITY 0x30 // negate results only before end of string
      
      /* One distinct bit per lane (lane i holds 1 << i). Testing a bit mask
       * against this table expands it into per-lane all-ones/all-zeros, and
       * AND-ing a lane mask with it followed by a horizontal add packs the
       * lanes back into an integer bit mask. */
      static uint16_t g_kp_mask_epi16[8] __attribute__((aligned(16))) = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
      /* Same pattern for 8-bit lanes, repeated for the low and the high 8 lanes. */
      static uint8_t g_kp_mask_epi8[16] __attribute__((aligned(16))) = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                                                      0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
      
      /*
       * Builds the 8x8 equality matrix for 16-bit elements.
       * For every lane k of b (0..7) the value b[k] is broadcast and compared
       * with all lanes of a; the lane mask is stored in mtx[k].vect_u16, so lane
       * i of row k is all-ones exactly when a[i] == b[k].
       * a and b are __kp_m128i lvalues; mtx is an array of at least 8 __kp_m128i.
       * The lane index of vgetq_lane_u16 must be a compile-time constant, hence
       * the unrolled form.
       */
      #define KP_PCMPSTR_EQ_16x8(a, b, mtx)                                                           \
          {                                                                                        \
              mtx[0].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 0)), a.vect_u16); \
              mtx[1].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 1)), a.vect_u16); \
              mtx[2].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 2)), a.vect_u16); \
              mtx[3].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 3)), a.vect_u16); \
              mtx[4].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 4)), a.vect_u16); \
              mtx[5].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 5)), a.vect_u16); \
              mtx[6].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 6)), a.vect_u16); \
              mtx[7].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 7)), a.vect_u16); \
          }
      
      /*
       * Builds the 16x16 equality matrix for 8-bit elements.
       * For every lane k of b (0..15) the value b[k] is broadcast and compared
       * with all lanes of a; the lane mask is stored in mtx[k].vect_u8, so lane
       * i of row k is all-ones exactly when a[i] == b[k].
       * a and b are __kp_m128i lvalues; mtx is an array of at least 16 __kp_m128i.
       * The lane index of vgetq_lane_u8 must be a compile-time constant, hence
       * the unrolled form.
       */
      #define KP_PCMPSTR_EQ_8x16(a, b, mtx)                                                       \
          {                                                                                    \
              mtx[0].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 0)), a.vect_u8);   \
              mtx[1].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 1)), a.vect_u8);   \
              mtx[2].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 2)), a.vect_u8);   \
              mtx[3].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 3)), a.vect_u8);   \
              mtx[4].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 4)), a.vect_u8);   \
              mtx[5].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 5)), a.vect_u8);   \
              mtx[6].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 6)), a.vect_u8);   \
              mtx[7].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 7)), a.vect_u8);   \
              mtx[8].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 8)), a.vect_u8);   \
              mtx[9].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 9)), a.vect_u8);   \
              mtx[10].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 10)), a.vect_u8); \
              mtx[11].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 11)), a.vect_u8); \
              mtx[12].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 12)), a.vect_u8); \
              mtx[13].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 13)), a.vect_u8); \
              mtx[14].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 14)), a.vect_u8); \
              mtx[15].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 15)), a.vect_u8); \
          }
      
      /*
       * Builds the range-comparison matrix for unsigned 16-bit elements.
       * Row k (0..7) compares the broadcast value b[k] with every lane of a.
       * The select mask sets the low 16 bits of each 32-bit lane, so (with the
       * little-endian lane layout of the union) even 16-bit lanes receive
       * "b[k] >= a[lane]" and odd lanes receive "b[k] <= a[lane]". a is thus
       * treated as (lower, upper) bound pairs.
       * The macro declares its own loop counter i inside the braces.
       */
      #define KP_PCMPSTR_RNG_U16x8(a, b, mtx)                                                                          \
          {                                                                                                         \
              uint16x8_t vect_b[8];                                                                                 \
              __kp_m128i mask;                                                                                         \
              mask.vect_u32 = vdupq_n_u32(0xffff);                                                                  \
              vect_b[0] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 0));                                               \
              vect_b[1] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 1));                                               \
              vect_b[2] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 2));                                               \
              vect_b[3] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 3));                                               \
              vect_b[4] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 4));                                               \
              vect_b[5] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 5));                                               \
              vect_b[6] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 6));                                               \
              vect_b[7] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 7));                                               \
              int i;                                                                                                \
              for (i = 0; i < 8; i++) {                                                                             \
                  mtx[i].vect_u16 = vbslq_u16(mask.vect_u16, vcgeq_u16(vect_b[i], a.vect_u16),                      \
                  vcleq_u16(vect_b[i], a.vect_u16));                                                                \
              }                                                                                                     \
          }
      /*
       * Signed 16-bit counterpart of KP_PCMPSTR_RNG_U16x8: the same row layout
       * and even/odd lane selection, but the comparisons use the signed
       * intrinsics vcgeq_s16 / vcleq_s16 on the vect_s16 views.
       * The macro declares its own loop counter i inside the braces.
       */
      #define KP_PCMPSTR_RNG_S16x8(a, b, mtx)                                                                          \
          {                                                                                                         \
              int16x8_t vect_b[8];                                                                                  \
              __kp_m128i mask;                                                                                         \
              mask.vect_u32 = vdupq_n_u32(0xffff);                                                                  \
              vect_b[0] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 0));                                               \
              vect_b[1] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 1));                                               \
              vect_b[2] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 2));                                               \
              vect_b[3] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 3));                                               \
              vect_b[4] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 4));                                               \
              vect_b[5] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 5));                                               \
              vect_b[6] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 6));                                               \
              vect_b[7] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 7));                                               \
              int i;                                                                                                \
              for (i = 0; i < 8; i++) {                                                                             \
                  mtx[i].vect_u16 = vbslq_u16(mask.vect_u16, vcgeq_s16(vect_b[i], a.vect_s16),                      \
                  vcleq_s16(vect_b[i], a.vect_s16));                                                                \
              }                                                                                                     \
          }
      
      /*
       * Builds the range-comparison matrix for unsigned 8-bit elements.
       * Row k (0..15) compares the broadcast value b[k] with every lane of a.
       * The select mask sets the low 8 bits of each 16-bit lane, so (with the
       * little-endian lane layout of the union) even byte lanes receive
       * "b[k] >= a[lane]" and odd lanes receive "b[k] <= a[lane]". a is thus
       * treated as (lower, upper) bound pairs.
       * The macro declares its own loop counter i inside the braces.
       */
      #define KP_PCMPSTR_RNG_U8x16(a, b, mtx)                                                                                 \
          {                                                                                                                \
              uint8x16_t vect_b[16];                                                                                       \
              __kp_m128i mask;                                                                                                \
              mask.vect_u16 = vdupq_n_u16(0xff);                                                                           \
              vect_b[0] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 0));                                                         \
              vect_b[1] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 1));                                                         \
              vect_b[2] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 2));                                                         \
              vect_b[3] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 3));                                                         \
              vect_b[4] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 4));                                                         \
              vect_b[5] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 5));                                                         \
              vect_b[6] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 6));                                                         \
              vect_b[7] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 7));                                                         \
              vect_b[8] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 8));                                                         \
              vect_b[9] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 9));                                                         \
              vect_b[10] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 10));                                                       \
              vect_b[11] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 11));                                                       \
              vect_b[12] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 12));                                                       \
              vect_b[13] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 13));                                                       \
              vect_b[14] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 14));                                                       \
              vect_b[15] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 15));                                                       \
              int i;                                                                                                       \
              for (i = 0; i < 16; i++) {                                                                                   \
                  mtx[i].vect_u8 = vbslq_u8(mask.vect_u8, vcgeq_u8(vect_b[i], a.vect_u8), vcleq_u8(vect_b[i], a.vect_u8)); \
              }                                                                                                            \
          }
      
      /*
       * Signed 8-bit counterpart of KP_PCMPSTR_RNG_U8x16: the same row layout
       * and even/odd lane selection, but the comparisons use the signed
       * intrinsics vcgeq_s8 / vcleq_s8 on the vect_s8 views.
       * The macro declares its own loop counter i inside the braces.
       */
      #define KP_PCMPSTR_RNG_S8x16(a, b, mtx)                                                                                 \
          {                                                                                                                \
              int8x16_t vect_b[16];                                                                                        \
              __kp_m128i mask;                                                                                                \
              mask.vect_u16 = vdupq_n_u16(0xff);                                                                           \
              vect_b[0] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 0));                                                         \
              vect_b[1] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 1));                                                         \
              vect_b[2] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 2));                                                         \
              vect_b[3] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 3));                                                         \
              vect_b[4] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 4));                                                         \
              vect_b[5] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 5));                                                         \
              vect_b[6] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 6));                                                         \
              vect_b[7] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 7));                                                         \
              vect_b[8] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 8));                                                         \
              vect_b[9] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 9));                                                         \
              vect_b[10] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 10));                                                       \
              vect_b[11] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 11));                                                       \
              vect_b[12] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 12));                                                       \
              vect_b[13] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 13));                                                       \
              vect_b[14] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 14));                                                       \
              vect_b[15] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 15));                                                       \
              int i;                                                                                                       \
              for (i = 0; i < 16; i++) {                                                                                   \
                  mtx[i].vect_u8 = vbslq_u8(mask.vect_u8, vcgeq_s8(vect_b[i], a.vect_s8), vcleq_s8(vect_b[i], a.vect_s8)); \
              }                                                                                                            \
          }
      
      /*
       * "Equal any" aggregation over a 16x16 byte equality matrix.
       *   la  - number of valid lanes in each row (columns >= la are ignored)
       *   lb  - number of rows to examine
       *   mtx - equality rows; rows 0..lb-1 are overwritten with the masked
       *         0/1 lane values as a side effect
       * Returns a mask whose bit r is set when row r has at least one match in
       * its first la lanes.
       */
      static int kp_aggregate_equal_any_8x16(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          const uint8x8_t lane_bits = vld1_u8(g_kp_mask_epi8);
          /* Expand the la-bit mask into per-lane all-ones for lanes < la. */
          const uint8x16_t a_valid = vcombine_u8(vtst_u8(vdup_n_u8(valid_a & 0xff), lane_bits),
                                                 vtst_u8(vdup_n_u8(valid_a >> 8), lane_bits));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              /* Keep only valid lanes and reduce each lane to 0 or 1. */
              mtx[row].vect_u8 = vshrq_n_u8(vandq_u8(a_valid, mtx[row].vect_u8), 7);
              if (vaddvq_u8(mtx[row].vect_u8) != 0) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * "Equal any" aggregation over an 8x8 equality matrix of 16-bit lanes.
       *   la  - number of valid lanes in each row (columns >= la are ignored)
       *   lb  - number of rows to examine
       *   mtx - equality rows; rows 0..lb-1 are overwritten with the masked
       *         0/1 lane values as a side effect
       * Returns a mask whose bit r is set when row r has at least one match in
       * its first la lanes.
       */
      static int kp_aggregate_equal_any_16x8(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          /* Expand the la-bit mask into per-lane all-ones for lanes < la. */
          const uint16x8_t a_valid = vtstq_u16(vdupq_n_u16(valid_a), vld1q_u16(g_kp_mask_epi16));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              /* Keep only valid lanes and reduce each lane to 0 or 1. */
              mtx[row].vect_u16 = vshrq_n_u16(vandq_u16(a_valid, mtx[row].vect_u16), 15);
              if (vaddvq_u16(mtx[row].vect_u16) != 0) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * "Equal any" comparison for 8-bit elements: builds the equality matrix
       * with KP_PCMPSTR_EQ_8x16 and returns the mask computed by
       * kp_aggregate_equal_any_8x16 for lengths la and lb.
       */
      static int kp_cal_res_byte_equal_any(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i eq_rows[16];
          int any_mask;

          KP_PCMPSTR_EQ_8x16(a, b, eq_rows);
          any_mask = kp_aggregate_equal_any_8x16(la, lb, eq_rows);
          return any_mask;
      }
      
      /*
       * "Equal any" comparison for 16-bit elements: builds the equality matrix
       * with KP_PCMPSTR_EQ_16x8 and returns the mask computed by
       * kp_aggregate_equal_any_16x8 for lengths la and lb.
       */
      static int kp_cal_res_word_equal_any(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i eq_rows[16];
          int any_mask;

          KP_PCMPSTR_EQ_16x8(a, b, eq_rows);
          any_mask = kp_aggregate_equal_any_16x8(la, lb, eq_rows);
          return any_mask;
      }
      
      /*
       * "Ranges" aggregation over a matrix of 16-bit lanes in which even lanes
       * hold the lower-bound test and odd lanes the upper-bound test.
       *   la  - number of valid lanes in each row (lanes >= la are ignored)
       *   lb  - number of rows to examine
       *   mtx - comparison rows; rows 0..lb-1 are overwritten with the masked
       *         0/1 lane values as a side effect
       * Returns a mask whose bit r is set when, for at least one even/odd lane
       * pair of row r, both tests passed.
       */
      static int kp_aggregate_ranges_16x8(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          const uint16x8_t a_valid = vtstq_u16(vdupq_n_u16(valid_a), vld1q_u16(g_kp_mask_epi16));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              /* Keep only valid lanes and reduce each lane to 0 or 1. */
              mtx[row].vect_u16 = vshrq_n_u16(vandq_u16(a_valid, mtx[row].vect_u16), 15);
              /* Move each odd lane onto its even neighbour and AND the pair. */
              const uint32x4_t odd_lanes = vshrq_n_u32(mtx[row].vect_u32, 16);
              const uint32x4_t pair_ok = vandq_u32(mtx[row].vect_u32, odd_lanes);
              if (vaddvq_u32(pair_ok) != 0) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * "Ranges" aggregation over a matrix of 8-bit lanes in which even lanes
       * hold the lower-bound test and odd lanes the upper-bound test.
       *   la  - number of valid lanes in each row (lanes >= la are ignored)
       *   lb  - number of rows to examine
       *   mtx - comparison rows; rows 0..lb-1 are overwritten with the masked
       *         0/1 lane values as a side effect
       * Returns a mask whose bit r is set when, for at least one even/odd lane
       * pair of row r, both tests passed.
       */
      static int kp_aggregate_ranges_8x16(int la, int lb, __kp_m128i mtx[16])
      {
          const int valid_a = (1 << la) - 1;
          const uint8x8_t lane_bits = vld1_u8(g_kp_mask_epi8);
          const uint8x16_t a_valid = vcombine_u8(vtst_u8(vdup_n_u8(valid_a & 0xff), lane_bits),
                                                 vtst_u8(vdup_n_u8(valid_a >> 8), lane_bits));
          int result = 0;

          for (int row = 0; row < lb; row++) {
              /* Keep only valid lanes and reduce each lane to 0 or 1. */
              mtx[row].vect_u8 = vshrq_n_u8(vandq_u8(a_valid, mtx[row].vect_u8), 7);
              /* Move each odd lane onto its even neighbour and AND the pair. */
              const uint16x8_t odd_lanes = vshrq_n_u16(mtx[row].vect_u16, 8);
              const uint16x8_t pair_ok = vandq_u16(mtx[row].vect_u16, odd_lanes);
              if (vaddvq_u16(pair_ok) != 0) {
                  result |= 1 << row;
              }
          }
          return result;
      }
      
      /*
       * "Ranges" comparison for unsigned 8-bit elements: builds the comparison
       * matrix with KP_PCMPSTR_RNG_U8x16 and returns the mask computed by
       * kp_aggregate_ranges_8x16 for lengths la and lb.
       */
      static int kp_cal_res_ubyte_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i range_rows[16];
          int range_mask;

          KP_PCMPSTR_RNG_U8x16(a, b, range_rows);
          range_mask = kp_aggregate_ranges_8x16(la, lb, range_rows);
          return range_mask;
      }
      
      /*
       * "Ranges" comparison for signed 8-bit elements: builds the comparison
       * matrix with KP_PCMPSTR_RNG_S8x16 and returns the mask computed by
       * kp_aggregate_ranges_8x16 for lengths la and lb.
       */
      static int kp_cal_res_sbyte_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i range_rows[16];
          int range_mask;

          KP_PCMPSTR_RNG_S8x16(a, b, range_rows);
          range_mask = kp_aggregate_ranges_8x16(la, lb, range_rows);
          return range_mask;
      }
      
      /*
       * "Ranges" comparison for unsigned 16-bit elements: builds the comparison
       * matrix with KP_PCMPSTR_RNG_U16x8 and returns the mask computed by
       * kp_aggregate_ranges_16x8 for lengths la and lb.
       */
      static int kp_cal_res_uword_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i range_rows[16];
          int range_mask;

          KP_PCMPSTR_RNG_U16x8(a, b, range_rows);
          range_mask = kp_aggregate_ranges_16x8(la, lb, range_rows);
          return range_mask;
      }
      
      /*
       * "Ranges" comparison for signed 16-bit elements: builds the comparison
       * matrix with KP_PCMPSTR_RNG_S16x8 and returns the mask computed by
       * kp_aggregate_ranges_16x8 for lengths la and lb.
       */
      static int kp_cal_res_sword_ranges(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i range_rows[16];
          int range_mask;

          KP_PCMPSTR_RNG_S16x8(a, b, range_rows);
          range_mask = kp_aggregate_ranges_16x8(la, lb, range_rows);
          return range_mask;
      }
      
      /*
       * "Equal each" comparison for 8-bit elements with explicit lengths.
       * Lane i of the result mask is:
       *   - 0                    when lb <= i < la (only a is still valid),
       *   - 1 if i >= lb else 0  when i >= la (a is past its end),
       *   - a[i] == b[i]         otherwise.
       * The per-lane results are packed into a 16-bit integer, lane 0 in bit 0.
       */
      static int kp_cal_res_byte_equal_each(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          const uint8x8_t lane_bits = vld1_u8(g_kp_mask_epi8);
          const uint8x16_t eq = vceqq_u8(a.vect_u8, b.vect_u8);
          const int a_only = (la < lb) ? 0 : ((1 << la) - (1 << lb)); /* bits lb..la-1 */
          const int past_a = 0x10000 - (1 << la);                     /* bits la..15 */
          const int past_b = 0x10000 - (1 << lb);                     /* bits lb..15 */

          /* Lanes 0-7. */
          uint8x8_t lo = vbsl_u8(vtst_u8(vdup_n_u8(a_only), lane_bits), vdup_n_u8(0), vget_low_u8(eq));
          lo = vbsl_u8(vtst_u8(vdup_n_u8(past_a), lane_bits), vtst_u8(vdup_n_u8(past_b), lane_bits), lo);
          lo = vand_u8(lo, lane_bits);

          /* Lanes 8-15 use the high byte of each mask. */
          uint8x8_t hi = vbsl_u8(vtst_u8(vdup_n_u8(a_only >> 8), lane_bits), vdup_n_u8(0), vget_high_u8(eq));
          hi = vbsl_u8(vtst_u8(vdup_n_u8(past_a >> 8), lane_bits), vtst_u8(vdup_n_u8(past_b >> 8), lane_bits), hi);
          hi = vand_u8(hi, lane_bits);

          /* Each lane now holds either 0 or its own bit; summing packs them. */
          return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
      }
      
      /*
       * "Equal each" comparison for 16-bit elements with explicit lengths.
       * Lane i of the result mask is:
       *   - 0                    when lb <= i < la (only a is still valid),
       *   - 1 if i >= lb else 0  when i >= la (a is past its end),
       *   - a[i] == b[i]         otherwise.
       * The per-lane results are packed into an 8-bit integer, lane 0 in bit 0.
       */
      static int kp_cal_res_word_equal_each(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          const uint16x8_t lane_bits = vld1q_u16(g_kp_mask_epi16);
          const int a_only = (la < lb) ? 0 : ((1 << la) - (1 << lb)); /* bits lb..la-1 */
          const int past_a = 0x100 - (1 << la);                       /* bits la..7 */
          const int past_b = 0x100 - (1 << lb);                       /* bits lb..7 */
          const uint16x8_t in_a_only = vtstq_u16(vdupq_n_u16(a_only), lane_bits);
          const uint16x8_t beyond_a = vtstq_u16(vdupq_n_u16(past_a), lane_bits);
          const uint16x8_t beyond_b = vtstq_u16(vdupq_n_u16(past_b), lane_bits);
          uint16x8_t eq = vceqq_u16(a.vect_u16, b.vect_u16);

          eq = vbslq_u16(in_a_only, vdupq_n_u16(0), eq);
          eq = vbslq_u16(beyond_a, beyond_b, eq);
          /* Each lane keeps either 0 or its own bit; summing packs them. */
          return vaddvq_u16(vandq_u16(eq, lane_bits));
      }
      
      /*
       * "Equal ordered" (substring search) aggregation over a byte equality
       * matrix in which lane i of row r is all-ones when a[i] == b[r].
       *   bound - number of elements per operand (16 for bytes; must not exceed 16)
       *   la    - valid length of a; lanes >= la are forced to "match"
       *   lb    - valid length of b; rows >= lb are forced to "no match" in the
       *           lanes where a is still valid
       *   mtx   - equality rows; overwritten with the length-adjusted matrix
       * Returns a mask whose bit j is set when a matches b starting at position
       * j, i.e. when every cell on the diagonal (row j + i, lane i) with
       * j + i < bound is set.
       *
       * The a-length mask is expanded separately for the low and the high eight
       * lanes so that lanes 8..15 follow the high byte of the mask, and the
       * aggregation walks the diagonal rather than a single row.
       */
      static int kp_aggregate_equal_ordered_8x16(int bound, int la, int lb, __kp_m128i mtx[16])
      {
          int res = 0;
          int j, k;
          int m1 = 0x10000 - (1 << la); /* bits la..15: lanes past the end of a */
          uint8x8_t vect_mask = vld1_u8(g_kp_mask_epi8);
          uint8x16_t vect1 = vcombine_u8(vtst_u8(vdup_n_u8(m1 & 0xff), vect_mask),
                                         vtst_u8(vdup_n_u8((m1 >> 8) & 0xff), vect_mask));
          uint8x16_t vect_minusone = vdupq_n_u8(0xff);
          uint8x16_t vect_zero = vdupq_n_u8(0);
          uint8_t cell[16][16];

          for (j = 0; j < lb; j++) {
              mtx[j].vect_u8 = vbslq_u8(vect1, vect_minusone, mtx[j].vect_u8);
          }
          for (j = lb; j < bound; j++) {
              mtx[j].vect_u8 = vbslq_u8(vect1, vect_minusone, vect_zero);
          }
          /* Spill the matrix so single cells can be addressed by (row, lane). */
          for (j = 0; j < bound; j++) {
              vst1q_u8(cell[j], mtx[j].vect_u8);
          }
          for (j = 0; j < bound; j++) {
              int val = 1;
              /* Stop at the first mismatch on the diagonal starting at row j. */
              for (k = j; k < bound && val == 1; k++) {
                  val = (cell[k][k - j] != 0) ? 1 : 0;
              }
              res |= (val << j);
          }
          return res;
      }
      
      /*
       * Reduces the 16-bit comparison matrix of an EQUAL_ORDERED compare to a
       * result bit mask.
       *
       * bound : number of 16-bit elements (the visible caller passes 8).
       * la    : element count of operand a; lanes with index >= la are forced
       *         to all-ones in every row.
       * lb    : element count of operand b; rows with index >= lb are replaced
       *         by the la selector alone (all other lanes zero).
       * mtx   : comparison rows, modified in place. The row layout is produced
       *         by KP_PCMPSTR_EQ_16x8, which is defined elsewhere.
       *
       * Returns a mask in which bit j is set when the first (bound - j) lanes
       * of row j are all non-zero.
       */
      static int kp_aggregate_equal_ordered_16x8(int bound, int la, int lb, __kp_m128i mtx[16])
      {
          int res = 0;
          int j;
          int m1 = 0x100 - (1 << la);
          uint16x8_t vect_mask = vld1q_u16(g_kp_mask_epi16);
          uint16x8_t vect1 = vtstq_u16(vdupq_n_u16(m1), vect_mask);
          uint16x8_t vect_minusone = vdupq_n_u16(-1);
          uint16x8_t vect_zero = vdupq_n_u16(0);
          for (j = 0; j < lb; j++) {
              mtx[j].vect_u16 = vbslq_u16(vect1, vect_minusone, mtx[j].vect_u16);
          }
          for (j = lb; j < bound; j++) {
              mtx[j].vect_u16 = vbslq_u16(vect1, vect_minusone, vect_zero);
          }
          /* One counter per lane; the trailing lanes are switched off row by row. */
          uint16_t enable[8] = {1, 1, 1, 1, 1, 1, 1, 1};
          for (j = 0; j < bound; j++) {
              uint16x8_t vect_en = vld1q_u16(enable);
              /*
               * Count the enabled lanes of row j that are set. This test used to
               * sit in an inner loop whose body never read its index, so a single
               * evaluation yields the same value.
               * NOTE(review): confirm that indexing by j (not by the former inner
               * index) is what the KP_PCMPSTR_EQ_16x8 layout requires.
               */
              int t = vaddvq_u16(vandq_u16(mtx[j].vect_u16, vect_en));
              if (t == bound - j) {
                  res += 1 << j;
              }
              enable[bound - 1 - j] = 0;
          }
          return res;
      }
      
      /*
       * EQUAL_ORDERED handler for byte elements: fills a comparison matrix from
       * a and b with KP_PCMPSTR_EQ_8x16, then reduces it with
       * kp_aggregate_equal_ordered_8x16 using a bound of 16 elements.
       */
      static int kp_cal_res_byte_equal_ordered(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i mtx[16];

          KP_PCMPSTR_EQ_8x16(a, b, mtx);

          const int ordered_bits = kp_aggregate_equal_ordered_8x16(16, la, lb, mtx);
          return ordered_bits;
      }
      
      /*
       * EQUAL_ORDERED handler for 16-bit elements: fills a comparison matrix
       * from a and b with KP_PCMPSTR_EQ_16x8, then reduces it with
       * kp_aggregate_equal_ordered_16x8 using a bound of 8 elements.
       */
      static int kp_cal_res_word_equal_ordered(__kp_m128i a, int la, __kp_m128i b, int lb)
      {
          __kp_m128i mtx[16];

          KP_PCMPSTR_EQ_16x8(a, b, mtx);

          const int ordered_bits = kp_aggregate_equal_ordered_16x8(8, la, lb, mtx);
          return ordered_bits;
      }
      
      /*
       * Compare modes of the explicit-length string compare emulation.
       * The values run consecutively from 0 to 15; the cmpestr entry points use
       * (imm8 & 0x0f) directly as an index into g_kp_cmpestrFuncList, so the
       * numeric value of each enumerator must not change.
       */
      typedef enum {
          KP_CMP_UBYTE_EQUAL_ANY = 0,
          KP_CMP_UWORD_EQUAL_ANY = 1,
          KP_CMP_SBYTE_EQUAL_ANY = 2,
          KP_CMP_SWORD_EQUAL_ANY = 3,
          KP_CMP_UBYTE_RANGES = 4,
          KP_CMP_UWORD_RANGES = 5,
          KP_CMP_SBYTE_RANGES = 6,
          KP_CMP_SWORD_RANGES = 7,
          KP_CMP_UBYTE_EQUAL_EACH = 8,
          KP_CMP_UWORD_EQUAL_EACH = 9,
          KP_CMP_SBYTE_EQUAL_EACH = 10,
          KP_CMP_SWORD_EQUAL_EACH = 11,
          KP_CMP_UBYTE_EQUAL_ORDERED = 12,
          KP_CMP_UWORD_EQUAL_ORDERED = 13,
          KP_CMP_SBYTE_EQUAL_ORDERED = 14,
          KP_CMP_SWORD_EQUAL_ORDERED = 15
      } _KP_MM_CMPESTR_ENUM;
      /* Signature shared by all explicit-length compare handlers. */
      typedef int (*KP_CMPESTR)(__kp_m128i a, int la, __kp_m128i b, int lb);

      /* One dispatch entry: a compare mode and the handler that implements it. */
      typedef struct {
          _KP_MM_CMPESTR_ENUM cmpintEnum;
          KP_CMPESTR cmpFun;
      } KP_CmpestrFuncList;

      /*
       * Dispatch table indexed by (imm8 & 0x0f). Entries must stay in the order
       * of _KP_MM_CMPESTR_ENUM because lookups are positional; cmpintEnum only
       * labels the slot. The signed EQUAL_ANY / EQUAL_EACH / EQUAL_ORDERED modes
       * reuse the unsigned handlers; only RANGES has separate signed handlers.
       * The table is never written after initialization, hence const.
       */
      static const KP_CmpestrFuncList g_kp_cmpestrFuncList[] = {
          {KP_CMP_UBYTE_EQUAL_ANY, kp_cal_res_byte_equal_any},
          {KP_CMP_UWORD_EQUAL_ANY, kp_cal_res_word_equal_any},
          {KP_CMP_SBYTE_EQUAL_ANY, kp_cal_res_byte_equal_any},
          {KP_CMP_SWORD_EQUAL_ANY, kp_cal_res_word_equal_any},
          {KP_CMP_UBYTE_RANGES, kp_cal_res_ubyte_ranges},
          {KP_CMP_UWORD_RANGES, kp_cal_res_uword_ranges},
          {KP_CMP_SBYTE_RANGES, kp_cal_res_sbyte_ranges},
          {KP_CMP_SWORD_RANGES, kp_cal_res_sword_ranges},
          {KP_CMP_UBYTE_EQUAL_EACH, kp_cal_res_byte_equal_each},
          {KP_CMP_UWORD_EQUAL_EACH, kp_cal_res_word_equal_each},
          {KP_CMP_SBYTE_EQUAL_EACH, kp_cal_res_byte_equal_each},
          {KP_CMP_SWORD_EQUAL_EACH, kp_cal_res_word_equal_each},
          {KP_CMP_UBYTE_EQUAL_ORDERED, kp_cal_res_byte_equal_ordered},
          {KP_CMP_UWORD_EQUAL_ORDERED, kp_cal_res_word_equal_ordered},
          {KP_CMP_SBYTE_EQUAL_ORDERED, kp_cal_res_byte_equal_ordered},
          {KP_CMP_SWORD_EQUAL_ORDERED, kp_cal_res_word_equal_ordered}
      };
      
      /*
       * Applies the polarity field of imm8 (bits 0x30) to an intermediate
       * compare result.
       *
       * res   : intermediate result mask.
       * lb    : element count of operand b; used as a shift amount.
       * imm8  : control byte; only (imm8 & 0x30) is examined here.
       * bound : element count of the compare (8 or 16).
       *
       * _KP_SIDD_NEGATIVE_POLARITY inverts every bit of res,
       * _KP_SIDD_MASKED_NEGATIVE_POLARITY inverts only the low lb bits, and any
       * other value leaves res unchanged. The result is then truncated to 8 bits
       * when bound == 8 and to 16 bits otherwise.
       */
      KP_FORCE_INLINE int kp_neg_fun(int res, int lb, int imm8, int bound)
      {
          const int polarity = imm8 & 0x30;
          const int width_mask = (bound == 8) ? 0xFF : 0xFFFF;

          if (polarity == _KP_SIDD_NEGATIVE_POLARITY) {
              res ^= 0xffffffff;
          } else if (polarity == _KP_SIDD_MASKED_NEGATIVE_POLARITY) {
              const int valid_bits = (1 << lb) - 1;
              res ^= valid_bits;
          }

          return res & width_mask;
      }
      
      /*
       * Explicit-length string compare returning an index.
       *
       * a, la   : first operand and its length; |la| is saturated to the
       *           element count.
       * b, lb   : second operand and its length; |lb| is saturated likewise.
       * imm8    : control byte. Bit 0 selects 8 elements (set) or 16 elements
       *           (clear); bits 0-3 select the handler in g_kp_cmpestrFuncList;
       *           bits 0x30 select the polarity applied by kp_neg_fun; bit 0x40
       *           selects the most significant (set) or least significant
       *           (clear) set bit of the result.
       * intRes2 : receives the result mask after polarity handling; must point
       *           to writable storage.
       *
       * Returns the index of the selected set bit of the result mask, or the
       * element count when the mask is zero.
       */
      int __remill_simd__mm_cmpestri(__kp_m128i a, int la, __kp_m128i b, int lb, const int imm8, int *intRes2)
      {
          int bound = (imm8 & 0x01) ? 8 : 16;

          /*
           * Absolute value of each length, saturated to bound. This replaces an
           * inline-asm sequence that changed the condition flags without a "cc"
           * clobber and hard-coded w0/w1 as scratch registers. The arithmetic is
           * done in unsigned so that INT_MIN also saturates to bound instead of
           * remaining negative and later being used as a shift count.
           */
          unsigned int abs_la = (la < 0) ? (0u - (unsigned int)la) : (unsigned int)la;
          unsigned int abs_lb = (lb < 0) ? (0u - (unsigned int)lb) : (unsigned int)lb;
          la = (abs_la > (unsigned int)bound) ? bound : (int)abs_la;
          lb = (abs_lb > (unsigned int)bound) ? bound : (int)abs_lb;

          int r2 = g_kp_cmpestrFuncList[imm8 & 0x0f].cmpFun(a, la, b, lb);
          r2 = kp_neg_fun(r2, lb, imm8, bound);
          *intRes2 = r2;

          if (r2 == 0) {
              return bound;
          }
          /* r2 is non-zero here, so clz/ctz are well defined. */
          return (imm8 & 0x40) ? (31 - __builtin_clz(r2)) : __builtin_ctz(r2);
      }
      
      /*
       * Explicit-length string compare returning a mask vector.
       *
       * a, la   : first operand and its length; |la| is saturated to the
       *           element count.
       * b, lb   : second operand and its length; |lb| is saturated likewise.
       * imm8    : control byte. Bit 0 selects 8 elements (set) or 16 elements
       *           (clear); bits 0-3 select the handler in g_kp_cmpestrFuncList;
       *           bits 0x30 select the polarity applied by kp_neg_fun; bit 0x40
       *           selects the output form (see below).
       * intRes2 : receives the result mask after polarity handling; must point
       *           to writable storage.
       *
       * Returns a vector that is zero except for:
       *   - bit 0x40 set: every lane whose bit is set in the result mask becomes
       *     all-ones (16-bit lanes when the element count is 8, byte lanes
       *     otherwise);
       *   - bit 0x40 clear: the result mask itself stored in lane 0.
       */
      __kp_m128i __remill_simd__mm_cmpestrm(__kp_m128i a, int la, __kp_m128i b, int lb, const int imm8, int *intRes2)
      {
          __kp_m128i dst;
          int bound = (imm8 & 0x01) ? 8 : 16;

          /*
           * Absolute value of each length, saturated to bound. This replaces an
           * inline-asm sequence that changed the condition flags without a "cc"
           * clobber and hard-coded w0/w1 as scratch registers. The arithmetic is
           * done in unsigned so that INT_MIN also saturates to bound instead of
           * remaining negative and later being used as a shift count.
           */
          unsigned int abs_la = (la < 0) ? (0u - (unsigned int)la) : (unsigned int)la;
          unsigned int abs_lb = (lb < 0) ? (0u - (unsigned int)lb) : (unsigned int)lb;
          la = (abs_la > (unsigned int)bound) ? bound : (int)abs_la;
          lb = (abs_lb > (unsigned int)bound) ? bound : (int)abs_lb;

          int r2 = g_kp_cmpestrFuncList[imm8 & 0x0f].cmpFun(a, la, b, lb);
          r2 = kp_neg_fun(r2, lb, imm8, bound);
          *intRes2 = r2;
          dst.vect_u8 = vdupq_n_u8(0);
          if (imm8 & 0x40) {
              /* Expand each bit of r2 into a whole lane. */
              if (bound == 8) {
                  uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), vld1q_u16(g_kp_mask_epi16));
                  dst.vect_u16 = vbslq_u16(tmp, vdupq_n_u16(-1), dst.vect_u16);
              } else {
                  /* Low byte of r2 drives lanes 0-7, high byte lanes 8-15. */
                  uint8x16_t vect_r2 = vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));
                  uint8x16_t tmp = vtstq_u8(vect_r2, vld1q_u8(g_kp_mask_epi8));
                  dst.vect_u8 = vbslq_u8(tmp, vdupq_n_u8(-1), dst.vect_u8);
              }
          } else {
              /* Store the bit mask itself in lane 0; the rest stays zero. */
              if (bound == 16) {
                  dst.vect_u16 = vsetq_lane_u16(r2 & 0xffff, dst.vect_u16, 0);
              } else {
                  dst.vect_u8 = vsetq_lane_u8(r2 & 0xff, dst.vect_u8, 0);
              }
          }

          return dst;
      }
      
      #endif // KUNPENG_TRANS_H
    3. 按“Esc”键,输入“:wq!”,按“Enter”保存并退出编辑。
  13. 将“KunpengTrans.h”和“KunpengTrans.cpp”上传到Muscle源码的“src”目录。
  14. 执行以下命令进行编译。
    make

    编译成功后,可执行文件生成在“/path/to/muscle/src/o”目录下。