#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

/* Short aliases for the unsigned byte and unsigned int types used by the
   test routines in this file. */
typedef  unsigned char  UChar;
typedef  unsigned int   UInt;

/* Deterministic pseudo-random source for the tests.

   Each call advances a linear congruential generator whose state starts at
   zero and persists across calls, then returns the state shifted right by
   17 bits.  There is no seeding, so every run yields the same sequence. */
static UInt randomUInt ( void )
{
   /* Multiplier and increment from "Numerical Recipes in C" 2nd Edition */
   const unsigned long multiplier = 1664525UL;
   const unsigned long increment  = 1013904223UL;
   static UInt state = 0;

   /* The assignment back to UInt truncates the product to UInt width. */
   state = (UInt)(multiplier * state + increment);

   /* Discard the low 17 bits of the state. */
   return state >> 17;
}

/* Execute one MMX "maskmovq" masked store and print the resulting bytes.

   An 8-byte heap buffer is seeded with 0x11, 0x22, ... 0x88.  The 8 bytes
   at regL are loaded into %mm1 and the 8 bytes at regR into %mm2, %rdi is
   pointed at the buffer, and "maskmovq %mm1,%mm2" is executed.  The buffer
   is then printed as 16 hex digits with no separator or newline, and freed.

   NOTE(review): in AT&T operand order %mm1 (loaded from regL) should be the
   byte mask and %mm2 (loaded from regR) the data -- confirm against the
   instruction reference before relying on this.

   regL, regR: each must point to at least 8 readable bytes.  Neither is
   written through. */
void maskmovq_mmx ( UChar* regL, UChar* regR )
{
   int i;
   UChar* dst = malloc(8);
   assert(dst);
   /* Known background pattern, so bytes the store leaves alone show up. */
   for (i = 0; i < 8; i++)
      dst[i] = 17 * (i+1);
   __asm__ __volatile__(
      "emms\n\t"
      "movq (%0), %%mm1\n\t"
      "movq (%1), %%mm2\n\t"
      /* maskmovq has no explicit memory operand; it stores through %rdi. */
      "movq %2, %%rdi\n\t"
      "maskmovq %%mm1,%%mm2\n\t"
      /* Put the FPU back in x87 mode before the library calls below, as
         the SysV AMD64 calling convention requires after MMX use. */
      "emms"
      : /*out*/ 
      : /*in*/ "r"(regL), "r"(regR), "r"(&dst[0])
      /* %mm1 and %mm2 are overwritten above, so they must be declared. */
      : /*trash*/ "rdi", "mm1", "mm2", "memory", "cc"
   );
   for (i = 0; i < 8; i++)
      printf("%02x", dst[i]);
   free(dst);
}

/* Execute one SSE "maskmovdqu" masked store and print the resulting bytes.

   A 16-byte heap buffer is seeded with 0x00, 0x01, ... 0x0f.  The 16 bytes
   at regL are loaded (unaligned) into %xmm1 and the 16 bytes at regR into
   %xmm12, %rdi is pointed at the buffer, and "maskmovdqu %xmm12,%xmm1" is
   executed, followed by an sfence.  The buffer is then printed as 32 hex
   digits with no separator or newline, and freed.

   NOTE(review): in AT&T operand order %xmm12 (loaded from regR) should be
   the byte mask and %xmm1 (loaded from regL) the data -- confirm against
   the instruction reference before relying on this.

   regL, regR: each must point to at least 16 readable bytes.  Neither is
   written through. */
void maskmovdqu_sse ( UChar* regL, UChar* regR )
{
   int i;
   UChar* dst = malloc(16);
   assert(dst);
   /* Known background pattern, so bytes the store leaves alone show up. */
   for (i = 0; i < 16; i++)
      dst[i] = i;
   __asm__ __volatile__(
      "movups (%0), %%xmm1\n\t"
      "movups (%1), %%xmm12\n\t"
      /* maskmovdqu has no explicit memory operand; it stores through %rdi. */
      "movq %2, %%rdi\n\t"
      "maskmovdqu %%xmm12,%%xmm1\n\t"
      "sfence"
      : /*out*/ 
      : /*in*/ "r"(regL), "r"(regR), "r"(dst)
      /* %xmm1 and %xmm12 are overwritten above, so they must be declared;
         otherwise the compiler may assume they survive the asm. */
      : /*trash*/ "rdi", "xmm1", "xmm12", "memory", "cc"
   );
   for (i = 0; i < 16; i++)
      printf("%02x", dst[i]);
   free(dst);
}

/* Fill buf[0 .. n-1] with successive bytes from randomUInt(), printing each
   byte as two hex digits as soon as it is generated. */
static void fillRandomAndPrint ( UChar* buf, int n )
{
   int j;
   for (j = 0; j < n; j++) {
      buf[j] = (UChar)randomUInt();
      printf("%02x", buf[j]);
   }
}

/* Test driver: runs 10 rounds of the MMX test followed by 10 rounds of the
   SSE test.

   Each round prints one line: the freshly generated regL bytes, a space,
   the freshly generated regR bytes, a space, and then whatever the test
   routine prints for the destination buffer.  regL is always generated
   before regR, so the pseudo-random sequence is consumed in a fixed order.

   Command-line arguments are ignored.  Always returns 0. */
int main ( int argc, char** argv )
{
   int i;

   (void)argc;
   (void)argv;

   /* mmx test */
   {
      UChar* regL = malloc(8);
      UChar* regR = malloc(8);
      assert(regL);
      assert(regR);
      for (i = 0; i < 10; i++) {
         fillRandomAndPrint( regL, 8 );
         printf(" ");
         fillRandomAndPrint( regR, 8 );
         printf(" ");
         /* The buffers are passed as (regR, regL); keep this order, since
            the printed results depend on it. */
         maskmovq_mmx( regR, regL );
         printf("\n");
      }
      free(regR);
      free(regL);
   }

   /* sse test */
   {
      UChar* regL = malloc(16);
      UChar* regR = malloc(16);
      assert(regL);
      assert(regR);
      for (i = 0; i < 10; i++) {
         fillRandomAndPrint( regL, 16 );
         printf(" ");
         fillRandomAndPrint( regR, 16 );
         printf(" ");
         /* The buffers are passed as (regR, regL); keep this order, since
            the printed results depend on it. */
         maskmovdqu_sse( regR, regL );
         printf("\n");
      }
      free(regR);
      free(regL);
   }

   return 0;
}