uppsrc_Core_r14486_v2.diff
uppsrc/Core/Cpu.cpp | ||
---|---|---|
149 | 149 |
void EndianSwap(int64 *v, size_t count) ENDIAN_SWAP |
150 | 150 |
void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP |
151 | 151 | |
152 |
#ifdef CPU_X86 |
|
152 |
#if defined(CPU_X86) && defined(__SSE2__) |
|
153 | ||
154 |
#include <emmintrin.h> |
|
155 | ||
153 | 156 |
void huge_memsetd(void *p, dword c, int len) |
154 | 157 |
{ // bypasses the cache, good for >4MB |
155 | 158 |
dword *t = (dword *)p; |
... | ... | |
173 | 176 |
while(len--) |
174 | 177 |
*t++ = c; |
175 | 178 |
} |
179 | ||
180 |
// Stores `data` into `len` consecutive dword slots starting at `p`.
// Counts below 4 use scalar stores; larger counts use unaligned 16-byte SSE2 stores
// (four dwords each), and counts of 1024*1024 or more are handed to huge_memsetd().
// NOTE(review): `len` is a signed int; a negative value takes the `len < 4` branch,
// where the bit tests can still be non-zero and store - confirm callers never pass one.
void memsetd(void *p, dword data, int len) |
|
181 |
{ |
|
182 |
dword *t = (dword *)p; |
|
183 |
// Short path: the two low bits of len select a 2-element and/or a 1-element scalar store.
if(len < 4) { |
|
184 |
if(len & 2) { |
|
185 |
t[0] = t[1] = data; |
|
186 |
t += 2; |
|
187 |
} |
|
188 |
if(len & 1) |
|
189 |
t[0] = data; |
|
190 |
return; |
|
191 |
} |
|
192 | ||
193 |
// Replicate data into all four 32-bit lanes of a 128-bit value.
__m128i val4 = _mm_set1_epi32(data); |
|
194 |
// Set4 writes four dwords at t + at with an unaligned store; t is captured by reference,
// so `at` is always relative to the current value of t.
auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); }; |
|
195 | ||
196 |
// Write the last four elements first (t still equals p here). This covers the final
// len % 4 elements, so the code below only deals with whole groups of four; later
// stores may overlap this one.
Set4(len - 4); // fill tail |
|
197 |
if(len >= 32) { |
|
198 |
if(len >= 1024*1024) { // for really huge data, bypass the cache |
|
199 |
huge_memsetd(t, data, len); |
|
200 |
return; |
|
201 |
} |
|
202 |
// e is the last position from which a full 32-dword chunk still fits in the range.
const dword *e = t + len - 32; |
|
203 |
// Each iteration fills 32 dwords with eight 4-dword stores; the loop runs len / 32 times.
do { |
|
204 |
Set4(0); Set4(4); Set4(8); Set4(12); |
|
205 |
Set4(16); Set4(20); Set4(24); Set4(28); |
|
206 |
t += 32; |
|
207 |
} |
|
208 |
while(t <= e); |
|
209 |
} |
|
210 |
// Remaining bits 16, 8 and 4 of len pick one more 16-, 8- and 4-dword chunk respectively.
if(len & 16) { |
|
211 |
Set4(0); Set4(4); Set4(8); Set4(12); |
|
212 |
t += 16; |
|
213 |
} |
|
214 |
if(len & 8) { |
|
215 |
Set4(0); Set4(4); |
|
216 |
t += 8; |
|
217 |
} |
|
218 |
if(len & 4) |
|
219 |
Set4(0); |
|
220 |
} |
|
176 | 221 |
#endif |
177 | 222 | |
178 | 223 |
} |
uppsrc/Core/Ops.h | ||
---|---|---|
342 | 342 |
return x >= -9223372036854775808.0 && x < 9223372036854775808.0; |
343 | 343 |
} |
344 | 344 |
|
345 |
#ifdef CPU_X86 |
|
346 |
|
|
347 |
#include <smmintrin.h> |
|
348 |
|
|
345 |
#if defined(CPU_X86) && defined(__SSE2__) |
|
349 | 346 |
// Prototype only: takes a destination pointer, a dword fill value and an int count.
// memsetd() hands over to it once the count reaches 1024*1024.
void huge_memsetd(void *p, dword data, int len);
350 |
|
|
351 |
// Inline fill routine: writes `data` into `len` consecutive dword slots beginning at `p`.
// Fewer than 4 elements are written with plain assignments; otherwise unaligned 16-byte
// SSE2 stores are used, and counts of 1024*1024 or more are forwarded to huge_memsetd().
// NOTE(review): `len` is signed; with a negative value the `len < 4` branch is taken and
// its bit tests can still trigger stores - verify that callers never pass a negative count.
inline |

352 |
void memsetd(void *p, dword data, int len) |

353 |
{ |

354 |
dword *t = (dword *)p; |

355 |
// Small counts (under 4): bit 1 of len requests two scalar stores, bit 0 requests one.
if(len < 4) { |

356 |
if(len & 2) { |

357 |
t[0] = t[1] = data; |

358 |
t += 2; |

359 |
} |

360 |
if(len & 1) |

361 |
t[0] = data; |

362 |
return; |

363 |
} |

364 |


365 |
// 128-bit value holding four copies of data.
__m128i val4 = _mm_set1_epi32(data); |

366 |
// Helper storing those four copies at t + at (unaligned store). It captures t by
// reference, so it follows every later `t += ...`.
auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); }; |

367 |


368 |
// The final four elements are written up front while t is still the start pointer;
// this takes care of the len % 4 leftover, at the cost of possibly overlapping later stores.
Set4(len - 4); // fill tail |

369 |
if(len >= 32) { |

370 |
if(len >= 1024*1024) { // for really huge data, bypass the cache |

371 |
huge_memsetd(t, data, len); |

372 |
return; |

373 |
} |

374 |
// Highest start position at which a whole 32-dword chunk still lies inside the range.
const dword *e = t + len - 32; |

375 |
// Main loop: 8 stores x 4 dwords = 32 dwords per pass, executed len / 32 times.
do { |

376 |
Set4(0); Set4(4); Set4(8); Set4(12); |

377 |
Set4(16); Set4(20); Set4(24); Set4(28); |

378 |
t += 32; |

379 |
} |

380 |
while(t <= e); |

381 |
} |

382 |
// What is left of len (below 32) is consumed by testing bits 16, 8 and 4 in turn.
if(len & 16) { |

383 |
Set4(0); Set4(4); Set4(8); Set4(12); |

384 |
t += 16; |

385 |
} |

386 |
if(len & 8) { |

387 |
Set4(0); Set4(4); |

388 |
t += 8; |

389 |
} |

390 |
if(len & 4) |

391 |
Set4(0); |

392 |
} |
|
347 |
// Prototype only: takes a destination pointer, a dword fill value and an int count.
// NOTE(review): presumably the out-of-line dword fill defined in Cpu.cpp - confirm.
void memsetd(void *p, dword data, int len); |
|
393 | 348 |
#else |
394 | 349 |
inline |
395 | 350 |
void memsetd(void *p, RGBA c, int len) |