// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
/// \file simd.h
/// \brief SIMD-type algorithms, with hardware acceleration, if available.
///
/// All algorithms are container-based because iterator syntax is just too
/// damn verbose and because the specializations need to be able to tell
/// how many elements are in the container in order to choose proper SIMD
/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
/// Specializations are only for the tuple template because the container
/// must be of a fixed and compile-time-known size for the compiler to be
/// able to choose the specialization.
///
#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
#include "uassert.h"
#include "ulimits.h"
#if HAVE_MATH_H
#include <math.h>
#endif
#if PLATFORM_ANDROID
#include <stdio.h>
#undef CPU_HAS_MMX
#endif
namespace ustl {
namespace simd {
//----------------------------------------------------------------------
// Generic algorithms
//----------------------------------------------------------------------
/// Applies \p op to each element in \p op1.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
foreach (typename Ctr::iterator, i, op1)
op (*i);
}
/// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
{
assert (op2.size() <= op1.size());
typename Ctr::const_iterator i1 (op1.begin());
typename Ctr::iterator i2 (op2.begin());
for (; i2 != op2.end(); ++i1, ++i2)
*i2 = op (*i2, *i1);
}
/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
assert (op1.size() <= op2.size() && op1.size() <= result.size());
passign (op1, result);
packop (op2, result);
}
/// Copies \p op1 into \p result.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result)
{
assert (op1.size() <= result.size());
typename Ctr::iterator d (result.begin());
foreach (typename Ctr::const_iterator, s, op1)
*d++ = *s;
}
/// Copies \p result.size() elements from \p op1 to \p result.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
foreach (typename Ctr::iterator, d, result)
*d = *op1++;
}
template <typename Ctr1, typename Ctr2, typename ConvertFunction>
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
{
assert (op1.size() <= op2.size());
typename Ctr1::const_iterator i1 (op1.begin());
typename Ctr2::iterator i2 (op2.begin());
for (; i1 != op1.end(); ++i1, ++i2)
*i2 = f (*i1);
}
// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
STD_BINARY_FUNCTOR (fpshl, T, (a << b))
STD_BINARY_FUNCTOR (fpshr, T, (a >> b))
STD_BINARY_FUNCTOR (fpmin, T, (min (a, b)))
STD_BINARY_FUNCTOR (fpmax, T, (max (a, b)))
STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2))
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
#if HAVE_MATH_H
STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
STD_UNARY_FUNCTOR (fpsqrt, T, (reset_mmx(), T (sqrt (a))))
STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))
STD_UNARY_FUNCTOR (fsin, T, (reset_mmx(), T (sin (a))))
STD_UNARY_FUNCTOR (fcos, T, (reset_mmx(), T (cos (a))))
STD_UNARY_FUNCTOR (ftan, T, (reset_mmx(), T (tan (a))))
#if HAVE_RINTF
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
#else
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
#endif
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
#endif
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
#define SIMD_PACKEDOP1(name, operation) \
template <typename Ctr> \
inline void name (Ctr& op1) \
{ \
typedef typename Ctr::value_type value_t; \
packop (op1, operation<value_t>()); \
}
#define SIMD_PACKEDOP2(name, operation) \
template <typename Ctr> \
inline void name (const Ctr& op1, Ctr& op2) \
{ \
typedef typename Ctr::value_type value_t; \
packop (op1, op2, operation<value_t>()); \
}
#define SIMD_PACKEDOP3(name, operation) \
template <typename Ctr> \
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result) \
{ \
typedef typename Ctr::value_type value_t; \
packop (op1, op2, result, operation<value_t>()); \
}
#define SIMD_SINGLEOP1(name, operation) \
template <typename T> \
inline T name (T op) \
{ \
operation<T> obj; \
return (obj(op)); \
}
#define SIMD_CONVERTOP(name, operation) \
template <typename Ctr1, typename Ctr2> \
inline void name (const Ctr1& op1, Ctr2& op2) \
{ \
typedef typename Ctr1::value_type value1_t; \
typedef typename Ctr2::value_type value2_t; \
pconvert (op1, op2, operation<value1_t, value2_t>());\
}
SIMD_PACKEDOP2 (padd, plus)
SIMD_PACKEDOP2 (psub, minus)
SIMD_PACKEDOP2 (pmul, multiplies)
SIMD_PACKEDOP2 (pdiv, divides)
SIMD_PACKEDOP2 (pand, bitwise_and)
SIMD_PACKEDOP2 (por, bitwise_or)
SIMD_PACKEDOP2 (pxor, bitwise_xor)
SIMD_PACKEDOP2 (pshl, fpshl)
SIMD_PACKEDOP2 (pshr, fpshr)
SIMD_PACKEDOP2 (psubs, fpsubs)
SIMD_PACKEDOP2 (pmin, fpmin)
SIMD_PACKEDOP2 (pmax, fpmax)
SIMD_PACKEDOP2 (pavg, fpavg)
SIMD_PACKEDOP3 (padd, plus)
SIMD_PACKEDOP3 (psub, minus)
SIMD_PACKEDOP3 (pmul, multiplies)
SIMD_PACKEDOP3 (pdiv, divides)
SIMD_PACKEDOP3 (pand, bitwise_and)
SIMD_PACKEDOP3 (por, bitwise_or)
SIMD_PACKEDOP3 (pxor, bitwise_xor)
SIMD_PACKEDOP3 (pshl, fpshl)
SIMD_PACKEDOP3 (pshr, fpshr)
SIMD_PACKEDOP3 (padds, fpadds)
SIMD_PACKEDOP3 (psubs, fpsubs)
SIMD_PACKEDOP3 (pmin, fpmin)
SIMD_PACKEDOP3 (pmax, fpmax)
SIMD_PACKEDOP3 (pavg, fpavg)
#if HAVE_MATH_H
SIMD_PACKEDOP1 (precip, fpreciprocal)
SIMD_PACKEDOP1 (psqrt, fpsqrt)
SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
SIMD_PACKEDOP1 (psin, fsin)
SIMD_PACKEDOP1 (pcos, fcos)
SIMD_PACKEDOP1 (ptan, ftan)
SIMD_SINGLEOP1 (srecip, fpreciprocal)
SIMD_SINGLEOP1 (ssqrt, fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
SIMD_SINGLEOP1 (ssin, fsin)
SIMD_SINGLEOP1 (scos, fcos)
SIMD_SINGLEOP1 (stan, ftan)
SIMD_CONVERTOP (pround, fround)
template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
#endif
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
//----------------------------------------------------------------------
// Vector types to cast tuple data to
//----------------------------------------------------------------------
#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
#define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs)))
#else
#define VECTOR_ATTRIBUTE(mode,vs)
#endif
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
#if HAVE_INT64_T
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
#endif
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
#undef VECTOR_ATTRIBUTE
//----------------------------------------------------------------------
// Hardware accelerated specializations
//----------------------------------------------------------------------
#define SIMD_PKOP2_SPEC(n, type, optype) \
template <> \
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
#define SIMD_PASSIGN_SPEC(n, type) \
template <> \
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
#define SIMD_IPASSIGN_SPEC(n, type) \
template <> \
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
#define SIMD_CONVERT_SPEC(n, type1, type2, optype) \
template <> \
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
#if CPU_HAS_MMX
#define STD_MMX_ARGS "=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS "=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
#define MMX_PKOP2_SPEC(n,type,optype,instruction) \
SIMD_PKOP2_SPEC(n,type,optype) \
{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \
SIMD_PKOP2_SPEC(n,type,optype) \
{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
#define MMX_PASSIGN_SPEC(n,type) \
SIMD_PASSIGN_SPEC(n,type) \
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_PASSIGN_SPEC(n,type) \
SIMD_PASSIGN_SPEC(n,type) \
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
#define MMX_IPASSIGN_SPEC(n,type) \
SIMD_IPASSIGN_SPEC(n,type) \
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_IPASSIGN_SPEC(n,type) \
SIMD_IPASSIGN_SPEC(n,type) \
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
MMX_PASSIGN_SPEC(8,uint8_t)
MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)
MMX_PASSIGN_SPEC(8,int8_t)
MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)
MMX_PASSIGN_SPEC(4,uint16_t)
MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
/// \todo psllw does not work like other operations, it uses the first element for shift count.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)
MMX_PASSIGN_SPEC(4,int16_t)
MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)
MMX_PASSIGN_SPEC(2,uint32_t)
MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
MMX_PASSIGN_SPEC(2,int32_t)
MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)
MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
#if CPU_HAS_SSE || CPU_HAS_3DNOW
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
#if CPU_HAS_3DNOW
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
#ifndef CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW
MMX_IPASSIGN_SPEC(8,uint8_t)
MMX_IPASSIGN_SPEC(4,uint16_t)
MMX_IPASSIGN_SPEC(2,uint32_t)
MMX_IPASSIGN_SPEC(2,float)
#ifndef CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC(4,float)
MMX_DBL_PASSIGN_SPEC(4,uint32_t)
MMX_DBL_PASSIGN_SPEC(4,int32_t)
MMX_DBL_IPASSIGN_SPEC(4,float)
MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
MMX_DBL_IPASSIGN_SPEC(4,int32_t)
#endif
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
#endif // CPU_HAS_MMX
#if CPU_HAS_SSE
#define STD_SSE_ARGS "=m"(oout[0]) : "m"(oin[0]) : "xmm0", "memory"
#define SSE_PKOP2_SPEC(n,type,optype,instruction) \
SIMD_PKOP2_SPEC(n,type,optype) \
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_PASSIGN_SPEC(n,type) \
SIMD_PASSIGN_SPEC(n,type) \
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_IPASSIGN_SPEC(n,type) \
SIMD_IPASSIGN_SPEC(n,type) \
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
SSE_PASSIGN_SPEC(4,float)
SSE_PASSIGN_SPEC(4,int32_t)
SSE_PASSIGN_SPEC(4,uint32_t)
SSE_PKOP2_SPEC(4,float,plus,addps)
SSE_PKOP2_SPEC(4,float,minus,subps)
SSE_PKOP2_SPEC(4,float,multiplies,mulps)
SSE_PKOP2_SPEC(4,float,divides,divps)
SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
SSE_PKOP2_SPEC(4,float,fpmax,maxps)
SSE_PKOP2_SPEC(4,float,fpmin,minps)
SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
asm ("cvtps2pi %2, %%mm0\n\t"
"cvtps2pi %3, %%mm1\n\t"
"movq %%mm0, %0\n\t"
"movq %%mm1, %1"
: DBL_MMX_ARGS);
reset_mmx();
}
SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
asm ("cvtpi2ps %2, %%xmm0\n\t"
"shufps $0x4E,%%xmm0,%%xmm0\n\t"
"cvtpi2ps %1, %%xmm0\n\t"
"movups %%xmm0, %0"
: "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
}
template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
register int32_t rv;
asm ("movss %1, %%xmm0\n\t"
"cvtss2si %%xmm0, %0"
: "=r"(rv) : "m"(a) : "xmm0" );
return (rv);
}
template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
register uint32_t rv;
asm ("movss %1, %%xmm0\n\t"
"cvtss2si %%xmm0, %0"
: "=r"(rv) : "m"(a) : "xmm0" );
return (rv);
}
SSE_IPASSIGN_SPEC(4,float)
SSE_IPASSIGN_SPEC(4,int32_t)
SSE_IPASSIGN_SPEC(4,uint32_t)
#undef SSE_IPASSIGN_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_PKOP2_SPEC
#undef STD_SSE_ARGS
#endif // CPU_HAS_SSE
#undef SIMD_PACKEDOP_SPEC
} // namespace simd
} // namespace ustl
#endif