/*
 * Copyright 2014 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkMath_opts_SSE2_DEFINED
#define SkMath_opts_SSE2_DEFINED

#include <emmintrin.h>

// SSE2 provides no _mm_div_epi32(), so this shim emulates a packed 32-bit
// integer divide: each lane of a and b is converted to float, the floats are
// divided, and the quotient is converted back to int32 with truncation.
// Callers must keep a and b within the range a float can hold precisely.
static inline __m128i shim_mm_div_epi32(const __m128i& a, const __m128i& b) {
    const __m128 quotient = _mm_div_ps(_mm_cvtepi32_ps(a), _mm_cvtepi32_ps(b));
    return _mm_cvttps_epi32(quotient);
}

// Four-lane SSE2 counterpart of SkSqrtBits (the portable version lives in
// SkMath.cpp). Runs a shift-and-subtract square root independently on each
// 32-bit lane of x. The loop body executes count + 1 times (at least once),
// and each pass appends one bit to the result.
static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) {
    const __m128i kOne = _mm_set1_epi32(1);
    __m128i result = _mm_setzero_si128();
    __m128i hi = _mm_setzero_si128();
    __m128i lo = x;

    for (;;) {
        // Move the top two bits of lo into the bottom of hi.
        hi = _mm_or_si128(_mm_slli_epi32(hi, 2), _mm_srli_epi32(lo, 30));
        lo = _mm_slli_epi32(lo, 2);

        // Open a slot for the next result bit; the trial value is 2 * result + 1.
        result = _mm_slli_epi32(result, 1);
        const __m128i trial = _mm_add_epi32(_mm_slli_epi32(result, 1), kOne);

        // reject is all-ones in lanes where hi < trial (signed compare).
        // Those lanes stay untouched; every other lane subtracts the trial
        // value from hi and sets the new low bit of the result.
        const __m128i reject = _mm_cmplt_epi32(hi, trial);
        hi = _mm_sub_epi32(hi, _mm_andnot_si128(reject, trial));
        result = _mm_add_epi32(result, _mm_andnot_si128(reject, kOne));

        if (--count < 0) {
            break;
        }
    }

    return result;
}

#endif // SkMath_opts_SSE2_DEFINED