NOTE(review): line breaks and tab indentation were reconstructed from a whitespace-collapsed copy of this patch; hunk line counts were re-checked against the @@ headers.
NOTE(review): the header name in both #include lines was lost in extraction; <emmintrin.h> (SSE2 intrinsics) is an assumption - confirm against the repository before applying.

diff --git uppsrc/Core/Cpu.cpp uppsrc/Core/Cpu.cpp
--- uppsrc/Core/Cpu.cpp
+++ uppsrc/Core/Cpu.cpp
@@ -149,7 +149,10 @@ void EndianSwap(int *v, size_t count) ENDIAN_SWAP
 void EndianSwap(int64 *v, size_t count) ENDIAN_SWAP
 void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP
 
-#ifdef CPU_X86
+#if defined(CPU_X86) && defined(__SSE2__)
+
+#include <emmintrin.h>
+
 void huge_memsetd(void *p, dword c, int len)
 { // bypasses the cache, good for >4MB
 	dword *t = (dword *)p;
@@ -173,6 +176,48 @@ void huge_memsetd(void *p, dword c, int len)
 	while(len--)
 		*t++ = c;
 }
+
+void memsetd(void *p, dword data, int len)
+{
+	dword *t = (dword *)p;
+	if(len < 4) {
+		if(len & 2) {
+			t[0] = t[1] = data;
+			t += 2;
+		}
+		if(len & 1)
+			t[0] = data;
+		return;
+	}
+
+	__m128i val4 = _mm_set1_epi32(data);
+	auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
+
+	Set4(len - 4); // fill tail
+	if(len >= 32) {
+		if(len >= 1024*1024) { // for really huge data, bypass the cache
+			huge_memsetd(t, data, len);
+			return;
+		}
+		const dword *e = t + len - 32;
+		do {
+			Set4(0); Set4(4); Set4(8); Set4(12);
+			Set4(16); Set4(20); Set4(24); Set4(28);
+			t += 32;
+		}
+		while(t <= e);
+	}
+	if(len & 16) {
+		Set4(0); Set4(4); Set4(8); Set4(12);
+		t += 16;
+	}
+	if(len & 8) {
+		Set4(0); Set4(4);
+		t += 8;
+	}
+	if(len & 4)
+		Set4(0);
+}
 #endif
 
 }
diff --git uppsrc/Core/Ops.h uppsrc/Core/Ops.h
--- uppsrc/Core/Ops.h
+++ uppsrc/Core/Ops.h
@@ -342,54 +342,9 @@ inline bool FitsInInt64(double x)
 	return x >= -9223372036854775808.0 && x < 9223372036854775808.0;
 }
 
-#ifdef CPU_X86
-
-#include <emmintrin.h>
-
+#if defined(CPU_X86) && defined(__SSE2__)
 void huge_memsetd(void *p, dword data, int len);
-
-inline
-void memsetd(void *p, dword data, int len)
-{
-	dword *t = (dword *)p;
-	if(len < 4) {
-		if(len & 2) {
-			t[0] = t[1] = data;
-			t += 2;
-		}
-		if(len & 1)
-			t[0] = data;
-		return;
-	}
-
-	__m128i val4 = _mm_set1_epi32(data);
-	auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); };
-
-	Set4(len - 4); // fill tail
-	if(len >= 32) {
-		if(len >= 1024*1024) { // for really huge data, bypass the cache
-			huge_memsetd(t, data, len);
-			return;
-		}
-		const dword *e = t + len - 32;
-		do {
-			Set4(0); Set4(4); Set4(8); Set4(12);
-			Set4(16); Set4(20); Set4(24); Set4(28);
-			t += 32;
-		}
-		while(t <= e);
-	}
-	if(len & 16) {
-		Set4(0); Set4(4); Set4(8); Set4(12);
-		t += 16;
-	}
-	if(len & 8) {
-		Set4(0); Set4(4);
-		t += 8;
-	}
-	if(len & 4)
-		Set4(0);
-}
+void memsetd(void *p, dword data, int len);
 #else
 inline
 void memsetd(void *p, RGBA c, int len)