uppsrc_Core_r14486_v2.diff

Proposed patch for uppsrc (since revision 14486, second variant) - Sender Ghost, 05/21/2020 04:43 PM

Download (2.64 KB)

View differences:

uppsrc/Core/Cpu.cpp
149 149
void EndianSwap(int64 *v, size_t count) ENDIAN_SWAP
150 150
void EndianSwap(uint64 *v, size_t count) ENDIAN_SWAP
151 151

  
152
#ifdef CPU_X86
152
#if defined(CPU_X86) && defined(__SSE2__)
153

  
154
#include <emmintrin.h>
155

  
153 156
void huge_memsetd(void *p, dword c, int len)
154 157
{ // bypasses the cache, good for >4MB
155 158
	dword *t = (dword *)p;
......
173 176
	while(len--)
174 177
		*t++ = c;
175 178
}
179

  
180
// Fills `len` consecutive dwords starting at `p` with the value `data`.
// `len` is an element count (dwords), not a byte count.
// Lengths below 4 use scalar stores; longer runs use unaligned 128-bit SSE2
// stores, and runs of 1024*1024 dwords or more are handed to huge_memsetd().
// NOTE(review): negative `len` is not checked here - presumably callers
// guarantee len >= 0; confirm.
void memsetd(void *p, dword data, int len)
181
{
182
	dword *t = (dword *)p;
183
	if(len < 4) { // short case: 0..3 dwords, written without SSE
184
		if(len & 2) {
185
			t[0] = t[1] = data;
186
			t += 2;
187
		}
188
		if(len & 1)
189
			t[0] = data;
190
		return;
191
	}
192

  
193
	__m128i val4 = _mm_set1_epi32(data); // four copies of data in one 128-bit register
194
	auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); }; // unaligned store of 4 dwords at t + at; t is captured by reference, so later changes to t move the target
195

  
196
	Set4(len - 4); // fill tail: the last 4 dwords, done first while t still points at the start; covers the len & 3 remainder the 4-wide steps below do not reach (may overlap them)
197
	if(len >= 32) {
198
		if(len >= 1024*1024) { // for really huge data, bypass the cache
199
			huge_memsetd(t, data, len);
200
			return;
201
		}
202
		const dword *e = t + len - 32; // last position from which a full 32-dword chunk still fits
203
		do { // 8 stores x 4 dwords = 32 dwords per iteration
204
			Set4(0); Set4(4); Set4(8); Set4(12);
205
			Set4(16); Set4(20); Set4(24); Set4(28);
206
			t += 32;
207
		}
208
		while(t <= e);
209
	}
210
	if(len & 16) { // remaining 16-dword chunk, if that bit of len is set
211
		Set4(0); Set4(4); Set4(8); Set4(12);
212
		t += 16;
213
	}
214
	if(len & 8) { // remaining 8-dword chunk
215
		Set4(0); Set4(4);
216
		t += 8;
217
	}
218
	if(len & 4) // remaining 4-dword chunk; anything below 4 was already written by the tail store
219
		Set4(0);
220
}
176 221
#endif
177 222

  
178 223
}
uppsrc/Core/Ops.h
342 342
	return x >= -9223372036854775808.0 && x < 9223372036854775808.0;
343 343
}
344 344

  
345
#ifdef CPU_X86
346

  
347
#include <smmintrin.h>
348

  
345
#if defined(CPU_X86) && defined(__SSE2__)
349 346
void huge_memsetd(void *p, dword data, int len);
350

  
351
// Inline variant: fills `len` consecutive dwords starting at `p` with `data`.
// `len` is an element count (dwords), not a byte count.
// Lengths below 4 use scalar stores; longer runs use unaligned 128-bit
// stores, and runs of 1024*1024 dwords or more are handed to huge_memsetd().
// NOTE(review): negative `len` is not checked here - presumably callers
// guarantee len >= 0; confirm.
inline
352
void memsetd(void *p, dword data, int len)
353
{
354
	dword *t = (dword *)p;
355
	if(len < 4) { // short case: 0..3 dwords, written without SSE
356
		if(len & 2) {
357
			t[0] = t[1] = data;
358
			t += 2;
359
		}
360
		if(len & 1)
361
			t[0] = data;
362
		return;
363
	}
364

  
365
	__m128i val4 = _mm_set1_epi32(data); // four copies of data in one 128-bit register
366
	auto Set4 = [&](int at) { _mm_storeu_si128((__m128i *)(t + at), val4); }; // unaligned store of 4 dwords at t + at; t is captured by reference, so later changes to t move the target
367

  
368
	Set4(len - 4); // fill tail: the last 4 dwords, done first while t still points at the start; covers the len & 3 remainder the 4-wide steps below do not reach (may overlap them)
369
	if(len >= 32) {
370
		if(len >= 1024*1024) { // for really huge data, bypass the cache
371
			huge_memsetd(t, data, len);
372
			return;
373
		}
374
		const dword *e = t + len - 32; // last position from which a full 32-dword chunk still fits
375
		do { // 8 stores x 4 dwords = 32 dwords per iteration
376
			Set4(0); Set4(4); Set4(8); Set4(12);
377
			Set4(16); Set4(20); Set4(24); Set4(28);
378
			t += 32;
379
		}
380
		while(t <= e);
381
	}
382
	if(len & 16) { // remaining 16-dword chunk, if that bit of len is set
383
		Set4(0); Set4(4); Set4(8); Set4(12);
384
		t += 16;
385
	}
386
	if(len & 8) { // remaining 8-dword chunk
387
		Set4(0); Set4(4);
388
		t += 8;
389
	}
390
	if(len & 4) // remaining 4-dword chunk; anything below 4 was already written by the tail store
391
		Set4(0);
392
}
347
void memsetd(void *p, dword data, int len);
393 348
#else
394 349
inline
395 350
void memsetd(void *p, RGBA c, int len)