/* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
   pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
   aspect. */

#include <string.h>
#include <stdio.h>
#include <assert.h>

/* Short integer aliases used throughout this file. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  unsigned long long int ULong;
/* Boolean held in one unsigned char; False and True are the two
   values the code in this file assigns to it. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)

//typedef  unsigned char  V128[16];
/* A vector operand/result.  The two members overlay the same storage:
   uChar gives a 16-byte view, uInt gives a four-element unsigned int
   view (the same 16 bytes when UInt is 32 bits wide). */
typedef
   union {
      UChar uChar[16];
      UInt  uInt[4];
   }
   V128;

/* Bit positions of the O, S, Z, A, C and P condition flags inside a
   flags word.  They are the same six bits (11, 7, 6, 4, 2, 0) that the
   0x8D5 mask keeps from the pushfq value in the h_pcmpistri_*
   functions. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks for the flags above (unsigned long long values). */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)


/* Count the leading zero bits of a 32-bit value.

   Returns the number of zero bits above the most significant set bit
   of x, in the range 0 .. 31, or 32 when x is zero.  Like the rest of
   this file, this assumes UInt is 32 bits wide.

   Implemented as a binary search using unsigned arithmetic only, so
   the result does not depend on how the compiler converts
   out-of-range values to a signed type or right-shifts negative
   numbers. */
UInt clz32 ( UInt x )
{
   UInt n = 0;

   if (x == 0)
      return 32;

   /* At each step: if the top half of the still-unexamined window is
      empty, count those zeroes and slide the rest up into its place. */
   if ((x >> 16) == 0) { n += 16; x <<= 16; }
   if ((x >> 24) == 0) { n +=  8; x <<=  8; }
   if ((x >> 28) == 0) { n +=  4; x <<=  4; }
   if ((x >> 30) == 0) { n +=  2; x <<=  2; }
   if ((x >> 31) == 0) { n +=  1; }

   return n;
}

/* Count the trailing zero bits of x; gives 32 when x is zero. */
UInt ctz32 ( UInt x )
{
   /* (~x) & (x-1) has a 1 in every position below the lowest set bit
      of x and nowhere else (all ones when x is 0), so the count of
      those ones is 32 minus the leading-zero count. */
   UInt belowLowest = (~x) & (x - 1);
   UInt leading     = clz32(belowLowest);
   return 32 - leading;
}

/* Fill *dst from a 16-character hex "summary" string.

   Each hex digit d becomes the byte 0xdd.  The string is read right to
   left: its last character supplies byte 0 and its first character
   supplies byte 15.  Asserts that the string has exactly 16 characters
   and that each one is a hex digit (either case). */
void expand ( V128* dst, char* summary )
{
   Int lane;
   assert( strlen(summary) == 16 );
   for (lane = 0; lane < 16; lane++) {
      UChar digit  = summary[15 - lane];
      UChar nibble = 0;

      if ('0' <= digit && digit <= '9')
         nibble = digit - '0';
      else if ('A' <= digit && digit <= 'F')
         nibble = 10 + (digit - 'A');
      else if ('a' <= digit && digit <= 'f')
         nibble = 10 + (digit - 'a');
      else
         assert(0);

      assert(nibble < 16);
      /* n * 0x11 duplicates the nibble into both halves of the byte */
      dst->uChar[lane] = (UChar)(nibble * 0x11);
   }
}

/* Run one test vector through both implementations and print a line.

   which  two-character label of the imm8 variant being tested
   h_fn   first implementation to call (callers pass an h_pcmpistri_*)
   s_fn   second implementation to call (callers pass an s_pcmpistri_*)
   summL, summR   16-digit hex summaries, expanded by expand()

   The printed line shows both results and ends in "!!!!" when they
   differ. */
void try_istri ( char* which,
                 UInt(*h_fn)(V128*,V128*),
                 UInt(*s_fn)(V128*,V128*),
                 char* summL, char* summR )
{
   V128        lhs, rhs;
   UInt        fromH, fromS;
   const char* verdict = "";

   assert(strlen(which) == 2);

   expand(&lhs, summL);
   expand(&rhs, summR);

   fromH = h_fn(&lhs, &rhs);
   fromS = s_fn(&lhs, &rhs);
   if (fromH != fromS)
      verdict = "!!!!";

   printf("istri %s  %s %s -> %08x %08x %s\n",
          which, summL, summR, fromH, fromS, verdict);
}

/* Build a 16-bit mask with bit i set exactly when byte i of *arg is
   zero. */
UInt zmask_from_V128 ( V128* arg )
{
   UInt mask = 0;
   UInt lane;
   for (lane = 0; lane < 16; lane++) {
      if (arg->uChar[lane] == 0)
         mask |= 1 << lane;
   }
   return mask;
}

//////////////////////////////////////////////////////////
//                                                      //
//                       GENERAL                        //
//                                                      //
//////////////////////////////////////////////////////////


/* Final stage shared by the pcmpXstrX cases handled in this file.

   Takes the intermediate match mask intRes1, applies the polarity
   selected by pol (using validL for the masked-negative case), and
   produces:
     - the I-format index, stored in the first UInt of *resV: position
       of the most significant (idx == 1) or least significant
       (idx == 0) set bit of the 16-bit result, or 16 if it is empty;
     - the new OSZACP flags, stored in *resOSZACP.
*/
static
void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
                                    /*OUT*/UInt* resOSZACP,
                                    UInt intRes1,
                                    UInt zmaskL, UInt zmaskR,
                                    UInt validL,
                                    UInt pol, UInt idx )
{
   assert((pol >> 2) == 0);
   assert((idx >> 1) == 0);

   /* apply polarity, then keep only the 16 per-byte bits */
   UInt polarised = 0;
   if (pol == 0 || pol == 2) {
      polarised = intRes1;             // pol + and pol m+
   } else if (pol == 1) {
      polarised = ~intRes1;            // pol -
   } else if (pol == 3) {
      polarised = intRes1 ^ validL;    // pol m-
   }
   polarised &= 0xFFFF;

   // index for ECX: 16 means "no bit set"
   UInt index = 16;
   if (polarised != 0) {
      if (idx)
         index = 31 - clz32(polarised);   // ms-1-bit
      else
         index = ctz32(polarised);        // ls-1-bit
   }
   resV->uInt[0] = index;

   // flags, common to all ISTRI and ISTRM cases; A and P stay zero
   UInt flags = 0;
   if (polarised != 0) flags |= MASK_C;   // C == 0 iff result is empty
   if (zmaskL != 0)    flags |= MASK_Z;   // Z == 1 iff any in argL is 0
   if (zmaskR != 0)    flags |= MASK_S;   // S == 1 iff any in argR is 0
   if (polarised & 1)  flags |= MASK_O;   // O == bit 0 of the result
   *resOSZACP = flags;
}


/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
   variants.

   For xSTRI variants, the new ECX value is placed in the 32 bits
   pointed to by *resV.  For xSTRM variants, the result is a 128 bit
   value and is placed at *resV in the obvious way.

   For all variants, the new OSZACP value is placed at *resOSZACP.

   argLV and argRV are the vector args.  The caller must prepare a
   16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   must be 1 for each zero byte of the respective arg.  For ESTRx
   variants this is derived from the explicit length indication, and
   must be 0 in all places except at the bit index corresponding to
   the valid length (0 .. 16).  If the valid length is 16 then the
   mask must be all zeroes.  In all cases, bits 31:16 must be zero.

   imm8 is the original immediate from the instruction.  isSTRM
   indicates whether this is a xSTRM or xSTRI variant, which controls
   how much of *resV is written.  Note that every case implemented
   below requires !isSTRM, so a call with isSTRM set returns False.

   If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *resV nor *resOSZACP are
   altered.
*/

Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
                     /*OUT*/UInt* resOSZACP,
                     V128* argLV,  V128* argRV,
                     UInt zmaskL, UInt zmaskR,
                     UInt imm8,   Bool isSTRM )
{
   assert(imm8 < 0x80);
   assert((zmaskL >> 16) == 0);
   assert((zmaskR >> 16) == 0);

   /* Explicitly reject any imm8 values that haven't been validated,
      even if they would probably work.  Life is too short to have
      unvalidated cases in the code base. */
   switch (imm8) {
      case 0x00:
      case 0x02: case 0x08: case 0x0C: case 0x12: case 0x1A:
      case 0x38: case 0x3A: case 0x44: case 0x4A:
         break;
      default:
         return False;
   }

   /* Split the immediate into its control fields. */
   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask

   /*----------------------------------------*/
   /*-- strcmp on byte data                --*/
   /*----------------------------------------*/

   if (agg == 2/*equal each, aka strcmp*/
       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
       && !isSTRM) {
      Int    i;
      UChar* argL = (UChar*)argLV;
      UChar* argR = (UChar*)argRV;
      UInt boolResII = 0;
      /* Walk from byte 15 down to byte 0, shifting left each time, so
         that bit i of boolResII ends up as (argL[i] == argR[i]). */
      for (i = 15; i >= 0; i--) {
         UChar cL  = argL[i];
         UChar cR  = argR[i];
         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
      }
      /* For a nonzero mask, (zmask | -zmask) has the lowest set bit of
         zmask and every bit above it set, so the complement keeps only
         the positions below the first zero byte.  For a zero mask the
         complement is all ones, i.e. every position is valid. */
      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))

      // do invalidation, common to all equal-each cases
      UInt intRes1
         = (boolResII & validL & validR)  // if both valid, use cmpres
           | (~ (validL | validR));       // if both invalid, force 1
                                          // else force 0
      intRes1 &= 0xFFFF;

      // generate I-format output
      pcmpXstrX_WRK_gen_output_fmt_I(
         resV, resOSZACP,
         intRes1, zmaskL, zmaskR, validL, pol, idx
      );

      return True;
   }

   /*----------------------------------------*/
   /*-- set membership on byte data        --*/
   /*----------------------------------------*/

   if (agg == 0/*equal any, aka find chars in a set*/
       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
       && !isSTRM) {
      /* argL: the string,  argR: charset */
      UInt   si, ci;
      UChar* argL    = (UChar*)argLV;
      UChar* argR    = (UChar*)argRV;
      UInt   boolRes = 0;
      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))

      /* Bit si of boolRes is set when valid string byte si equals any
         valid charset byte.  Both loops stop at the first invalid
         position, so bits at and beyond the end of the string stay 0. */
      for (si = 0; si < 16; si++) {
         if ((validL & (1 << si)) == 0)
            // run off the end of the string.
            break;
         UInt m = 0;
         for (ci = 0; ci < 16; ci++) {
            if ((validR & (1 << ci)) == 0) break;
            if (argR[ci] == argL[si]) { m = 1; break; }
         }
         boolRes |= (m << si);
      }

      // boolRes is "pre-invalidated"
      UInt intRes1 = boolRes & 0xFFFF;
   
      // generate I-format output
      pcmpXstrX_WRK_gen_output_fmt_I(
         resV, resOSZACP,
         intRes1, zmaskL, zmaskR, validL, pol, idx
      );

      return True;
   }

   /*----------------------------------------*/
   /*-- substring search on byte data      --*/
   /*----------------------------------------*/

   if (agg == 3/*equal ordered, aka substring search*/
       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
       && !isSTRM) {

      /* argL: haystack,  argR: needle */
      UInt   ni, hi;
      UChar* argL    = (UChar*)argLV;
      UChar* argR    = (UChar*)argRV;
      UInt   boolRes = 0;
      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
      /* Bit hi of boolRes is set when the valid part of the needle
         matches the haystack starting at offset hi.  The comparison
         stops without a mismatch once it reaches the end of the needle
         or byte 16 of the haystack, so a needle that only partly fits
         still counts as a match. */
      for (hi = 0; hi < 16; hi++) {
         UInt m = 1;
         for (ni = 0; ni < 16; ni++) {
            if ((validR & (1 << ni)) == 0) break;
            UInt i = ni + hi;
            if (i >= 16) break;
            if (argL[i] != argR[ni]) { m = 0; break; }
         }
         boolRes |= (m << hi);
         /* The end-of-haystack test comes after the bit is recorded,
            so the first invalid haystack position is still given a
            result bit before the loop exits. */
         if ((validL & (1 << hi)) == 0)
            // run off the end of the haystack
            break;
      }

      // boolRes is "pre-invalidated"
      UInt intRes1 = boolRes & 0xFFFF;

      // generate I-format output
      pcmpXstrX_WRK_gen_output_fmt_I(
         resV, resOSZACP,
         intRes1, zmaskL, zmaskR, validL, pol, idx
      );

      return True;
   }

   /*----------------------------------------*/
   /*-- ranges, unsigned byte data         --*/
   /*----------------------------------------*/

   if (agg == 1/*ranges*/
       && fmt == 0/*ub*/
       && !isSTRM) {

      /* argL: string,  argR: range-pairs */
      UInt   ri, si;
      UChar* argL    = (UChar*)argLV;
      UChar* argR    = (UChar*)argRV;
      UInt   boolRes = 0;
      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
      /* Bit si of boolRes is set when valid string byte si lies inside
         any (low, high) pair taken from consecutive bytes of argR.  A
         pair is only used when both of its bytes are valid. */
      for (si = 0; si < 16; si++) {
         if ((validL & (1 << si)) == 0)
            // run off the end of the string
            break;
         UInt m = 0;
         for (ri = 0; ri < 16; ri += 2) {
            if ((validR & (3 << ri)) != (3 << ri)) break;
            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { 
               m = 1; break;
            }
         }
         boolRes |= (m << si);
      }

      // boolRes is "pre-invalidated"
      UInt intRes1 = boolRes & 0xFFFF;

      // generate I-format output
      pcmpXstrX_WRK_gen_output_fmt_I(
         resV, resOSZACP,
         intRes1, zmaskL, zmaskR, validL, pol, idx
      );

      return True;
   }

   /* No implemented case matched this imm8/isSTRM combination. */
   return False;
}


//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_4A                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Execute the real "pcmpistri $0x4A" instruction on the host CPU.

   argL is loaded into xmm2 and argR into xmm11.  The return value has
   the resulting flags (only the bits kept by 0x8D5) in bits 16 and up,
   and the low 16 bits of the resulting rcx in bits 15:0. */
UInt h_pcmpistri_4A ( V128* argL, V128* argR )
{
   /* Copy both operands into one buffer so the asm needs only a
      single pointer input (%2). */
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move rsp down before the pushfq below and restore it at the
         end; presumably so the push cannot overwrite data the compiler
         keeps just below rsp (red zone) -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags in rdx before addq can change them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* 0x8D5 keeps bits 11, 7, 6, 4, 2 and 0: the positions named by
      SHIFT_O, SHIFT_S, SHIFT_Z, SHIFT_A, SHIFT_P and SHIFT_C. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Simulated counterpart of h_pcmpistri_4A: runs pcmpXstrX_WRK with
   imm8 0x4A and packs the outcome as (flags << 16) | ECX. */
UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
{
   V128 ecxVec;
   UInt flagBits;
   UInt zeroesL = zmask_from_V128(argLU);
   UInt zeroesR = zmask_from_V128(argRU);

   Bool handled = pcmpXstrX_WRK( &ecxVec, &flagBits, argLU, argRU,
                                 zeroesL, zeroesR,
                                 0x4A, False/*!isSTRM*/ );
   assert(handled);

   return (flagBits << 16) | ecxVec.uInt[0];
}

void istri_4A ( void )
{
   char* wot = "4A";
   UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
   UInt(*s)(V128*,V128*) = s_pcmpistri_4A;

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
}

//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_3A                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Execute the real "pcmpistri $0x3A" instruction on the host CPU.

   argL is loaded into xmm2 and argR into xmm11.  The return value has
   the resulting flags (only the bits kept by 0x8D5) in bits 16 and up,
   and the low 16 bits of the resulting rcx in bits 15:0. */
UInt h_pcmpistri_3A ( V128* argL, V128* argR )
{
   /* Copy both operands into one buffer so the asm needs only a
      single pointer input (%2). */
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move rsp down before the pushfq below and restore it at the
         end; presumably so the push cannot overwrite data the compiler
         keeps just below rsp (red zone) -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags in rdx before addq can change them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* 0x8D5 keeps bits 11, 7, 6, 4, 2 and 0: the positions named by
      SHIFT_O, SHIFT_S, SHIFT_Z, SHIFT_A, SHIFT_P and SHIFT_C. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Simulated counterpart of h_pcmpistri_3A: runs pcmpXstrX_WRK with
   imm8 0x3A and packs the outcome as (flags << 16) | ECX. */
UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
{
   V128 ecxVec;
   UInt flagBits;
   UInt zeroesL = zmask_from_V128(argLU);
   UInt zeroesR = zmask_from_V128(argRU);

   Bool handled = pcmpXstrX_WRK( &ecxVec, &flagBits, argLU, argRU,
                                 zeroesL, zeroesR,
                                 0x3A, False/*!isSTRM*/ );
   assert(handled);

   return (flagBits << 16) | ecxVec.uInt[0];
}

void istri_3A ( void )
{
   char* wot = "3A";
   UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
   UInt(*s)(V128*,V128*) = s_pcmpistri_3A;

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
}



//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_0C                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Execute the real "pcmpistri $0x0C" instruction on the host CPU.

   argL is loaded into xmm2 and argR into xmm11.  The return value has
   the resulting flags (only the bits kept by 0x8D5) in bits 16 and up,
   and the low 16 bits of the resulting rcx in bits 15:0. */
__attribute__((noinline))
UInt h_pcmpistri_0C ( V128* argL, V128* argR )
{
   /* Copy both operands into one buffer so the asm needs only a
      single pointer input (%2). */
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res = 0, flags = 0;
   __asm__ __volatile__(
      /* pushfq stores 8 bytes just below rsp.  The compiler may keep
         locals there without adjusting rsp (the x86-64 red zone), so
         step rsp well clear of that area first, exactly as the other
         h_pcmpistri_* functions do.  pcmpistri rewrites the flags
         afterwards, so the subq does not disturb the result. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags in rdx before addq can change them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* 0x8D5 keeps bits 11, 7, 6, 4, 2 and 0: the positions named by
      SHIFT_O, SHIFT_S, SHIFT_Z, SHIFT_A, SHIFT_P and SHIFT_C. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Simulated counterpart of h_pcmpistri_0C: runs pcmpXstrX_WRK with
   imm8 0x0C and packs the outcome as (flags << 16) | ECX. */
UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
{
   V128 ecxVec;
   UInt flagBits;
   UInt zeroesL = zmask_from_V128(argLU);
   UInt zeroesR = zmask_from_V128(argRU);

   Bool handled = pcmpXstrX_WRK( &ecxVec, &flagBits, argLU, argRU,
                                 zeroesL, zeroesR,
                                 0x0C, False/*!isSTRM*/ );
   assert(handled);

   return (flagBits << 16) | ecxVec.uInt[0];
}

void istri_0C ( void )
{
   char* wot = "0C";
   UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
   UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
   
   try_istri(wot,h,s, "111111111abcde11", "00000000000abcde"); 

   try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde"); 

   try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde"); 
   try_istri(wot,h,s, "11111111111abcde", "00000000000abcde"); 
   try_istri(wot,h,s, "111111111111abcd", "00000000000abcde"); 

   try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde"); 

   try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde"); 

   try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde"); 

   try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde"); 
   try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde"); 

   try_istri(wot,h,s, "1111111111111234", "0000000000000000"); 
   try_istri(wot,h,s, "1111111111111234", "0000000000000001"); 
   try_istri(wot,h,s, "1111111111111234", "0000000000000011"); 

   try_istri(wot,h,s, "1111111111111234", "1111111111111234"); 
   try_istri(wot,h,s, "a111111111111111", "000000000000000a"); 
   try_istri(wot,h,s, "b111111111111111", "000000000000000a"); 

   try_istri(wot,h,s, "b111111111111111", "0000000000000000");
   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
   try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
}


//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_08                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Execute the real "pcmpistri $0x08" instruction on the host CPU.

   argL is loaded into xmm2 and argR into xmm11.  The return value has
   the resulting flags (only the bits kept by 0x8D5) in bits 16 and up,
   and the low 16 bits of the resulting rcx in bits 15:0. */
UInt h_pcmpistri_08 ( V128* argL, V128* argR )
{
   /* Copy both operands into one buffer so the asm needs only a
      single pointer input (%2). */
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move rsp down before the pushfq below and restore it at the
         end; presumably so the push cannot overwrite data the compiler
         keeps just below rsp (red zone) -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags in rdx before addq can change them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* 0x8D5 keeps bits 11, 7, 6, 4, 2 and 0: the positions named by
      SHIFT_O, SHIFT_S, SHIFT_Z, SHIFT_A, SHIFT_P and SHIFT_C. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Simulated counterpart of h_pcmpistri_08: runs pcmpXstrX_WRK with
   imm8 0x08 and packs the outcome as (flags << 16) | ECX. */
UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
{
   V128 ecxVec;
   UInt flagBits;
   UInt zeroesL = zmask_from_V128(argLU);
   UInt zeroesR = zmask_from_V128(argRU);

   Bool handled = pcmpXstrX_WRK( &ecxVec, &flagBits, argLU, argRU,
                                 zeroesL, zeroesR,
                                 0x08, False/*!isSTRM*/ );
   assert(handled);

   return (flagBits << 16) | ecxVec.uInt[0];
}

void istri_08 ( void )
{
   char* wot = "08";
   UInt(*h)(V128*,V128*) = h_pcmpistri_08;
   UInt(*s)(V128*,V128*) = s_pcmpistri_08;

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
}



//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_1A                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Execute the real "pcmpistri $0x1A" instruction on the host CPU.

   argL is loaded into xmm2 and argR into xmm11.  The return value has
   the resulting flags (only the bits kept by 0x8D5) in bits 16 and up,
   and the low 16 bits of the resulting rcx in bits 15:0. */
UInt h_pcmpistri_1A ( V128* argL, V128* argR )
{
   /* Copy both operands into one buffer so the asm needs only a
      single pointer input (%2). */
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move rsp down before the pushfq below and restore it at the
         end; presumably so the push cannot overwrite data the compiler
         keeps just below rsp (red zone) -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
      /* Capture the flags in rdx before addq can change them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* 0x8D5 keeps bits 11, 7, 6, 4, 2 and 0: the positions named by
      SHIFT_O, SHIFT_S, SHIFT_Z, SHIFT_A, SHIFT_P and SHIFT_C. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Simulated counterpart of h_pcmpistri_1A: runs pcmpXstrX_WRK with
   imm8 0x1A and packs the outcome as (flags << 16) | ECX. */
UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
{
   V128 ecxVec;
   UInt flagBits;
   UInt zeroesL = zmask_from_V128(argLU);
   UInt zeroesR = zmask_from_V128(argRU);

   Bool handled = pcmpXstrX_WRK( &ecxVec, &flagBits, argLU, argRU,
                                 zeroesL, zeroesR,
                                 0x1A, False/*!isSTRM*/ );
   assert(handled);

   return (flagBits << 16) | ecxVec.uInt[0];
}

void istri_1A ( void )
{
   char* wot = "1A";
   UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
   UInt(*s)(V128*,V128*) = s_pcmpistri_1A;

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
}



//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_02                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Execute the real "pcmpistri $0x02" instruction on the host CPU.

   argL is loaded into xmm2 and argR into xmm11.  The return value has
   the resulting flags (only the bits kept by 0x8D5) in bits 16 and up,
   and the low 16 bits of the resulting rcx in bits 15:0. */
UInt h_pcmpistri_02 ( V128* argL, V128* argR )
{
   /* Copy both operands into one buffer so the asm needs only a
      single pointer input (%2). */
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* Move rsp down before the pushfq below and restore it at the
         end; presumably so the push cannot overwrite data the compiler
         keeps just below rsp (red zone) -- TODO confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
      /* The two disabled lines below would run the pcmpistrm form
         instead and copy the low 32 bits of xmm0 into ecx. */
//"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
//"movd %%xmm0, %%ecx" "\n\t"
      /* Capture the flags in rdx before addq can change them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* 0x8D5 keeps bits 11, 7, 6, 4, 2 and 0: the positions named by
      SHIFT_O, SHIFT_S, SHIFT_Z, SHIFT_A, SHIFT_P and SHIFT_C. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Simulated counterpart of h_pcmpistri_02: runs pcmpXstrX_WRK with
   imm8 0x02 and packs the outcome as (flags << 16) | ECX. */
UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
{
   V128 ecxVec;
   UInt flagBits;
   UInt zeroesL = zmask_from_V128(argLU);
   UInt zeroesR = zmask_from_V128(argRU);

   Bool handled = pcmpXstrX_WRK( &ecxVec, &flagBits, argLU, argRU,
                                 zeroesL, zeroesR,
                                 0x02, False/*!isSTRM*/ );
   assert(handled);

   return (flagBits << 16) | ecxVec.uInt[0];
}

void istri_02 ( void )
{
   char* wot = "02";
   UInt(*h)(V128*,V128*) = h_pcmpistri_02;
   UInt(*s)(V128*,V128*) = s_pcmpistri_02;

   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab"); 
   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd"); 

   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd"); 

   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0"); 

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba"); 

   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0"); 

   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe"); 
   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe"); 
}


//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_12                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Hardware reference: executes the real PCMPISTRI instruction with
   imm8 = 0x12 on the host CPU.

   argL and argR are first copied into one contiguous two-element array
   so that a single base register (%2) can address both operands:
   block[0] (argL) is loaded into xmm2 and block[1] (argR) into xmm11.

   Returns a 32-bit word with the captured rflags, masked by 0x8D5 (the
   O, S, Z, A, P and C bit positions given by the SHIFT_x definitions at
   the top of this file), shifted up by 16, OR-ed with the low 16 bits
   of rcx as left by the instruction. */
UInt h_pcmpistri_12 ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* NOTE(review): presumably moves rsp below the ABI red zone so
         that the pushfq further down cannot overwrite live stack data
         of this function -- confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
      /* Disabled debugging variant, kept verbatim: the pcmpistrm form of
         the same operation followed by a copy of the low 32 bits of
         xmm0 into ecx. */
//"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
//"movd %%xmm0, %%ecx" "\n\t"
      /* Read rflags via the stack before the addq below modifies them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      /* Undo the rsp adjustment made at the top. */
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* Flags go in the upper half, the rcx value in the lower half. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Software counterpart of h_pcmpistri_12.  Hands both operands to
   pcmpXstrX_WRK with imm8 = 0x12 and the isSTRM flag cleared, passing
   along a mask obtained from zmask_from_V128 for each operand.  The
   worker is required (by assert) to report success.  The return value
   places the worker's OSZACP word above bit 16 and OR-s in the first
   32-bit lane of its vector result. */
UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
{
   V128 outVec;
   UInt outFlags;
   Bool worked = pcmpXstrX_WRK(
                    &outVec, &outFlags, argLU, argRU,
                    zmask_from_V128(argLU), zmask_from_V128(argRU),
                    0x12, False/*!isSTRM*/
                 );

   assert(worked);
   return (outFlags << 16) | outVec.uInt[0];
}

void istri_12 ( void )
{
   char* wot = "12";
   UInt(*h)(V128*,V128*) = h_pcmpistri_12;
   UInt(*s)(V128*,V128*) = s_pcmpistri_12;

   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab"); 
   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd"); 

   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd"); 

   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0"); 

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba"); 

   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0"); 

   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe"); 
   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe"); 
}



//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_44                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Hardware reference: executes the real PCMPISTRI instruction with
   imm8 = 0x44 on the host CPU.

   argL and argR are first copied into one contiguous two-element array
   so that a single base register (%2) can address both operands:
   block[0] (argL) is loaded into xmm2 and block[1] (argR) into xmm11.

   Returns a 32-bit word with the captured rflags, masked by 0x8D5 (the
   O, S, Z, A, P and C bit positions given by the SHIFT_x definitions at
   the top of this file), shifted up by 16, OR-ed with the low 16 bits
   of rcx as left by the instruction. */
UInt h_pcmpistri_44 ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* NOTE(review): presumably moves rsp below the ABI red zone so
         that the pushfq further down cannot overwrite live stack data
         of this function -- confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
      /* Disabled debugging variant, kept verbatim: a pcmpistrm form
         followed by a copy of the low 32 bits of xmm0 into ecx.
         NOTE(review): the disabled line encodes $0x04, not the $0x44
         used by the live instruction above -- adjust before enabling. */
//"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
//"movd %%xmm0, %%ecx" "\n\t"
      /* Read rflags via the stack before the addq below modifies them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      /* Undo the rsp adjustment made at the top. */
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* Flags go in the upper half, the rcx value in the lower half. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Software counterpart of h_pcmpistri_44.  Hands both operands to
   pcmpXstrX_WRK with imm8 = 0x44 and the isSTRM flag cleared, passing
   along a mask obtained from zmask_from_V128 for each operand.  The
   worker is required (by assert) to report success.  The return value
   places the worker's OSZACP word above bit 16 and OR-s in the first
   32-bit lane of its vector result. */
UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
{
   V128 outVec;
   UInt outFlags;
   Bool worked = pcmpXstrX_WRK(
                    &outVec, &outFlags, argLU, argRU,
                    zmask_from_V128(argLU), zmask_from_V128(argRU),
                    0x44, False/*!isSTRM*/
                 );

   assert(worked);
   return (outFlags << 16) | outVec.uInt[0];
}

void istri_44 ( void )
{
   char* wot = "44";
   UInt(*h)(V128*,V128*) = h_pcmpistri_44;
   UInt(*s)(V128*,V128*) = s_pcmpistri_44;

   try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc"); 
   try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb"); 
   try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb"); 
   try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb"); 

   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb"); 
   try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb"); 
   try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb"); 
   try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb"); 
   try_istri(wot,h,s, "0000000000000000", "00000000000000cb"); 

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 

   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb"); 
   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b"); 
   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb"); 

   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb"); 
   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb"); 
   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");

   try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");

   try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
}


//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_00                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Hardware reference: executes the real PCMPISTRI instruction with
   imm8 = 0x00 on the host CPU.

   argL and argR are first copied into one contiguous two-element array
   so that a single base register (%2) can address both operands:
   block[0] (argL) is loaded into xmm2 and block[1] (argR) into xmm11.

   Returns a 32-bit word with the captured rflags, masked by 0x8D5 (the
   O, S, Z, A, P and C bit positions given by the SHIFT_x definitions at
   the top of this file), shifted up by 16, OR-ed with the low 16 bits
   of rcx as left by the instruction. */
UInt h_pcmpistri_00 ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* NOTE(review): presumably moves rsp below the ABI red zone so
         that the pushfq further down cannot overwrite live stack data
         of this function -- confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
      /* Disabled debugging variant, kept verbatim: the pcmpistrm form of
         the same operation followed by a copy of the low 32 bits of
         xmm0 into ecx. */
//"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
//"movd %%xmm0, %%ecx" "\n\t"
      /* Read rflags via the stack before the addq below modifies them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      /* Undo the rsp adjustment made at the top. */
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* Flags go in the upper half, the rcx value in the lower half. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Software counterpart of h_pcmpistri_00.  Hands both operands to
   pcmpXstrX_WRK with imm8 = 0x00 and the isSTRM flag cleared, passing
   along a mask obtained from zmask_from_V128 for each operand.  The
   worker is required (by assert) to report success.  The return value
   places the worker's OSZACP word above bit 16 and OR-s in the first
   32-bit lane of its vector result. */
UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
{
   V128 outVec;
   UInt outFlags;
   Bool worked = pcmpXstrX_WRK(
                    &outVec, &outFlags, argLU, argRU,
                    zmask_from_V128(argLU), zmask_from_V128(argRU),
                    0x00, False/*!isSTRM*/
                 );

   assert(worked);
   return (outFlags << 16) | outVec.uInt[0];
}

void istri_00 ( void )
{
   char* wot = "00";
   UInt(*h)(V128*,V128*) = h_pcmpistri_00;
   UInt(*s)(V128*,V128*) = s_pcmpistri_00;

   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab"); 
   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd"); 

   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd"); 

   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d"); 
   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0"); 

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb"); 
   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba"); 

   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0"); 

   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe"); 
   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe"); 
}


//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_38                       //
//                                                      //
//////////////////////////////////////////////////////////

/* Hardware reference: executes the real PCMPISTRI instruction with
   imm8 = 0x38 on the host CPU.

   argL and argR are first copied into one contiguous two-element array
   so that a single base register (%2) can address both operands:
   block[0] (argL) is loaded into xmm2 and block[1] (argR) into xmm11.

   Returns a 32-bit word with the captured rflags, masked by 0x8D5 (the
   O, S, Z, A, P and C bit positions given by the SHIFT_x definitions at
   the top of this file), shifted up by 16, OR-ed with the low 16 bits
   of rcx as left by the instruction. */
UInt h_pcmpistri_38 ( V128* argL, V128* argR )
{
   V128 block[2];
   memcpy(&block[0], argL, sizeof(V128));
   memcpy(&block[1], argR, sizeof(V128));
   ULong res, flags;
   __asm__ __volatile__(
      /* NOTE(review): presumably moves rsp below the ABI red zone so
         that the pushfq further down cannot overwrite live stack data
         of this function -- confirm. */
      "subq      $1024,  %%rsp"             "\n\t"
      "movdqu    0(%2),  %%xmm2"            "\n\t"
      "movdqu    16(%2), %%xmm11"           "\n\t"
      "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
      /* Read rflags via the stack before the addq below modifies them. */
      "pushfq"                              "\n\t"
      "popq      %%rdx"                     "\n\t"
      "movq      %%rcx,  %0"                "\n\t"
      "movq      %%rdx,  %1"                "\n\t"
      /* Undo the rsp adjustment made at the top. */
      "addq      $1024,  %%rsp"             "\n\t"
      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   );
   /* Flags go in the upper half, the rcx value in the lower half. */
   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
}

/* Software counterpart of h_pcmpistri_38.  Hands both operands to
   pcmpXstrX_WRK with imm8 = 0x38 and the isSTRM flag cleared, passing
   along a mask obtained from zmask_from_V128 for each operand.  The
   worker is required (by assert) to report success.  The return value
   places the worker's OSZACP word above bit 16 and OR-s in the first
   32-bit lane of its vector result. */
UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
{
   V128 outVec;
   UInt outFlags;
   Bool worked = pcmpXstrX_WRK(
                    &outVec, &outFlags, argLU, argRU,
                    zmask_from_V128(argLU), zmask_from_V128(argRU),
                    0x38, False/*!isSTRM*/
                 );

   assert(worked);
   return (outFlags << 16) | outVec.uInt[0];
}

void istri_38 ( void )
{
   char* wot = "38";
   UInt(*h)(V128*,V128*) = h_pcmpistri_38;
   UInt(*s)(V128*,V128*) = s_pcmpistri_38;

   try_istri(wot,h,s, "0000000000000000", "0000000000000000"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa"); 
   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa"); 
   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa"); 

   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa"); 
   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa"); 

   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa"); 
   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000"); 
}



//////////////////////////////////////////////////////////
//                                                      //
//                         main                         //
//                                                      //
//////////////////////////////////////////////////////////

/* Test driver: runs every istri_XX test group exactly once, in the
   fixed order listed below, then exits with status 0.
   NOTE(review): the call order presumably determines the order of any
   output produced by try_istri, so reordering may break comparisons
   against recorded expected output -- confirm before changing it. */
int main ( void )
{
   istri_4A();
   istri_3A();
   istri_08();
   istri_1A();
   istri_02();
   istri_0C();
   istri_12();
   istri_44();
   istri_00();
   istri_38();
   return 0;
}