/*
* Copyright (c) 2009-2011 Intel Corporation. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <emmintrin.h>
#include <x86intrin.h>
inline void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
{
bool isAligned = (((size_t)(src_buff) | (size_t)(dst_buff)) & 0xF) == 0;
if (!isAligned) {
memcpy(dst_buff, src_buff, size);
return;
}
static const size_t regs_count = 8;
__m128i xmm_data0, xmm_data1, xmm_data2, xmm_data3;
__m128i xmm_data4, xmm_data5, xmm_data6, xmm_data7;
size_t remain_data = size & (regs_count * sizeof(xmm_data0) - 1);
size_t end_position = 0;
__m128i* pWb_buff = (__m128i*)dst_buff;
__m128i* pWb_buff_end = pWb_buff + ((size - remain_data) >> 4);
__m128i* pWc_buff = (__m128i*)src_buff;
/*sync the wc memory data*/
_mm_mfence();
while (pWb_buff < pWb_buff_end)
{
xmm_data0 = _mm_stream_load_si128(pWc_buff);
xmm_data1 = _mm_stream_load_si128(pWc_buff + 1);
xmm_data2 = _mm_stream_load_si128(pWc_buff + 2);
xmm_data3 = _mm_stream_load_si128(pWc_buff + 3);
xmm_data4 = _mm_stream_load_si128(pWc_buff + 4);
xmm_data5 = _mm_stream_load_si128(pWc_buff + 5);
xmm_data6 = _mm_stream_load_si128(pWc_buff + 6);
xmm_data7 = _mm_stream_load_si128(pWc_buff + 7);
pWc_buff += regs_count;
_mm_store_si128(pWb_buff, xmm_data0);
_mm_store_si128(pWb_buff + 1, xmm_data1);
_mm_store_si128(pWb_buff + 2, xmm_data2);
_mm_store_si128(pWb_buff + 3, xmm_data3);
_mm_store_si128(pWb_buff + 4, xmm_data4);
_mm_store_si128(pWb_buff + 5, xmm_data5);
_mm_store_si128(pWb_buff + 6, xmm_data6);
_mm_store_si128(pWb_buff + 7, xmm_data7);
pWb_buff += regs_count;
}
/*copy data by 16 bytes step from the remainder*/
if (remain_data >= 16)
{
size = remain_data;
remain_data = size & 15;
end_position = size >> 4;
for (size_t i = 0; i < end_position; ++i)
{
pWb_buff[i] = _mm_stream_load_si128(pWc_buff + i);
}
}
/*copy the remainder data, if it still existed*/
if (remain_data)
{
__m128i temp_data = _mm_stream_load_si128(pWc_buff + end_position);
char* psrc_buf = (char*)(&temp_data);
char* pdst_buf = (char*)(pWb_buff + end_position);
for (size_t i = 0; i < remain_data; ++i)
{
pdst_buf[i] = psrc_buf[i];
}
}
}