; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
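
; Test folding of (trunc (lshr (mul (zext/sext X), (zext/sext Y)), 16)) into
; the PMULHUW/PMULHW family at vector widths from v4i16 up to v64i16.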

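; v4i16 is promoted to v4i32. SSE2 repacks the promoted lanes to i16 so it can
; use PMULHUW; SSE4.1/AVX instead zero the odd words and use PMULLD+PSRLD.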
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhuw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhuw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

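; Signed v4i16: SSE2 repacks and uses PMULHW, while SSE4.1/AVX sign-extend the
; promoted lanes in place with PSLLD+PSRAD before PMULLD+PSRLD.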
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pslld $16, %xmm0
; SSE41-NEXT:    psrad $16, %xmm0
; SSE41-NEXT:    pslld $16, %xmm1
; SSE41-NEXT:    psrad $16, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

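; At the legal v8i16 type the whole pattern folds to a single PMULHUW.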
define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

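; The signed v8i16 pattern folds to a single PMULHW.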
define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

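; v16i16 is split into two PMULHUWs on SSE but stays a single YMM VPMULHUW on AVX.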
define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

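; Signed v16i16: two PMULHWs on SSE, one YMM VPMULHW on AVX.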
define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

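; v32i16 takes four XMM PMULHUWs on SSE and two YMM VPMULHUWs on AVX2/AVX512F;
; AVX512BW makes v32i16 legal and selects a single ZMM VPMULHUW.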
define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm4, %xmm0
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    pmulhuw %xmm6, %xmm2
; SSE-NEXT:    pmulhuw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

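; Signed v32i16: four XMM, two YMM, or one ZMM PMULHW depending on subtarget.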
define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm4, %xmm0
; SSE-NEXT:    pmulhw %xmm5, %xmm1
; SSE-NEXT:    pmulhw %xmm6, %xmm2
; SSE-NEXT:    pmulhw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

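; v64i16 no longer fits in registers on SSE: the second operand is passed on
; the stack and the result is returned through the hidden pointer in %rdi.
; AVX512BW needs only two ZMM VPMULHUWs.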
define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

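; Signed v64i16 mirrors the unsigned case, selecting PMULHW at each width.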
define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}