Home » Developing U++ » U++ Developers corner » SSE2 and SVO optimization (Painter, memcpy....)
Re: BufferPainter::Clear() optimization [message #53936 is a reply to message #53935] |
Sat, 16 May 2020 01:59 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Hi,
I've worked on optimizing the new_memsetd() operation through various buffer sizes up to 8M and here's the best I can come up with (at least this night...). With CLANG it seems to be beneficial to use the Mirek's new MemSet() for buffer sizes above about 1M, but below that and also with MSBT19 / MSBT19x64 the result is better without. (This algorithm is especially efficient with small fills and therefore should work well as a BufferPainter filler too.) For best results, there are separate versions for 32-bit and 64-bit code. (The '#ifdef WIN64' obviously only works on Windows, but I think there was some other flag on Linux for detecting a 64-bit environment. Please apply that flag, whatever it is, if you test on Linux, etc...)
#ifdef WIN64
inline void new_memsetd(dword *t, dword data, int len){
#ifdef COMPILER_CLANG
if(len>1024*1024){
MemSet(t,data,len);
return;
}
#endif
if(len&1) *t++=data;
len>>=1;
uint64 *w=(uint64*)t;
uint64 q=data;
q = (q << 32) | data;
switch(len) {
default:{
uint64 *lim = w + len;
while(w < lim) *w++ = q;
break;
}
case 16: w[15] = q;
case 15: w[14] = q;
case 14: w[13] = q;
case 13: w[12] = q;
case 12: w[11] = q;
case 11: w[10] = q;
case 10: w[9] = q;
case 9: w[8] = q;
case 8: w[7] = q;
case 7: w[6] = q;
case 6: w[5] = q;
case 5: w[4] = q;
case 4: w[3] = q;
case 3: w[2] = q;
case 2: w[1] = q;
case 1: w[0] = q;
}
}
#else
inline void new_memsetd(dword *t, dword data, int len){
#ifdef COMPILER_CLANG
if(len>1024*1024){
MemSet(t,data,len);
return;
}
#endif
switch(len) {
default:{
dword *lim = t + len;
while(t < lim) *t++ = data;
break;
}
case 16: t[15] = data;
case 15: t[14] = data;
case 14: t[13] = data;
case 13: t[12] = data;
case 12: t[11] = data;
case 11: t[10] = data;
case 10: t[9] = data;
case 9: t[8] = data;
case 8: t[7] = data;
case 7: t[6] = data;
case 6: t[5] = data;
case 5: t[4] = data;
case 4: t[3] = data;
case 3: t[2] = data;
case 2: t[1] = data;
case 1: t[0] = data;
}
}
#endif
The updated benchmarking code:
RGBA c = Red();
int bsize=8*1024*1024;
Buffer<RGBA> b(bsize,(RGBA)Blue());
String result="\"N\",\"Fill()\",\"new_memsetd()\",\"MemSet()\"\r\n";
for(int len=1;len<=bsize;){
int maximum=100000000/len;
int64 t0=usecs();
for(int i = 0; i < maximum; i++) Fill(~b, c, len);
int64 t1=usecs();
for(int i = 0; i < maximum; i++) new_memsetd((dword*)~b, *(dword*)&(c), len);
int64 t2=usecs();
for(int i = 0; i < maximum; i++) MemSet(~b, c, len);
int64 t3=usecs();
result.Cat(Format("%d,%f,%f,%f\r\n",len,1000.0*(t1-t0)/maximum,1000.0*(t2-t1)/maximum,1000.0*(t3-t2)/maximum));
if(len<32) len++;
else len*=2;
}
SaveFile(GetHomeDirFile("Desktop/memset.csv"),result);
Again, I suggest you plot your results using a log-log chart to clearly see the performance with all different block sizes.
If you have some time to spare, please let me know how this works for you.
Best regards,
Tom
|
|
|
Re: BufferPainter::Clear() optimization [message #53946 is a reply to message #53935] |
Sun, 17 May 2020 00:10 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Hi,
Interestingly, the FillRGBA() that can be found in the current BufferPainter Fillers, is a real performer. It wins below 1M dwords just about anything else. However, Mireks new MemSet is the winner thereafter. This applies on Windows 10 x64 to CLANG/CLANGx64/MSBT19 on my Core i7. Only MSBT19x64 has a different situation and the following code tries to optimize that, in addition to combining FillRGBA and MemSet for the other compilers:
#if defined(WIN64) && defined(COMPILER_MSC)
// for MSBT19x64 only:
inline void new_memsetd(void *b, dword data, int len){
dword *t=(dword *)b;
switch(len){
case 6: t[5] = data;
case 5: t[4] = data;
case 4: t[3] = data;
case 3: t[2] = data;
case 2: t[1] = data;
case 1: t[0] = data;
case 0: return;
default:{
if(len&1) *t++=data;
len>>=1;
uint64 *w=(uint64*)t;
uint64 q=*(dword*)&data;
q |= (q << 32);
switch(len) {
default:{
uint64 *lim = w + len - 32;
while(w < lim) *w++ = q;
}
case 32: w[31] = q;
case 31: w[30] = q;
case 30: w[29] = q;
case 29: w[28] = q;
case 28: w[27] = q;
case 27: w[26] = q;
case 26: w[25] = q;
case 25: w[24] = q;
case 24: w[23] = q;
case 23: w[22] = q;
case 22: w[21] = q;
case 21: w[20] = q;
case 20: w[19] = q;
case 19: w[18] = q;
case 18: w[17] = q;
case 17: w[16] = q;
case 16: w[15] = q;
case 15: w[14] = q;
case 14: w[13] = q;
case 13: w[12] = q;
case 12: w[11] = q;
case 11: w[10] = q;
case 10: w[9] = q;
case 9: w[8] = q;
case 8: w[7] = q;
case 7: w[6] = q;
case 6: w[5] = q;
case 5: w[4] = q;
case 4: w[3] = q;
case 3: w[2] = q;
case 2: w[1] = q;
case 1: w[0] = q;
}
}
}
}
#else
inline void new_memsetd(void *b, dword data, int len){
if(len<=1024*1024) FillRGBA((RGBA*)b,*(RGBA*)&data,len);
else MemSet(b,data,len);
}
#endif
The benchmarking code for various fill sizes now looks like this:
RGBA c = Red();
int bsize=8*1024*1024;
Buffer<RGBA> b(bsize,(RGBA)Blue());
String result="\"N\",\"Fill()\",\"new_memsetd()\",\"MemSet()\",\"FillRGBA()\"\r\n";
for(int len=1;len<=bsize;){
int maximum=100000000/len;
int64 t0=usecs();
for(int i = 0; i < maximum; i++) Fill(~b, c, len);
int64 t1=usecs();
for(int i = 0; i < maximum; i++) new_memsetd(~b, *(dword*)&c, len);
int64 t2=usecs();
for(int i = 0; i < maximum; i++) MemSet(~b, c, len);
int64 t3=usecs();
for(int i = 0; i < maximum; i++) FillRGBA(~b, c, len);
int64 t4=usecs();
result.Cat(Format("%d,%f,%f,%f,%f\r\n",len,1000.0*(t1-t0)/maximum,1000.0*(t2-t1)/maximum,1000.0*(t3-t2)/maximum,1000.0*(t4-t3)/maximum));
if(len<64) len++;
else len*=2;
}
SaveFile(GetHomeDirFile("Desktop/memset.csv"),result);
Best regards,
Tom
|
|
|
|
Re: BufferPainter::Clear() optimization [message #53948 is a reply to message #53947] |
Sun, 17 May 2020 10:01 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
mirek wrote on Sun, 17 May 2020 09:47Tom1 wrote on Sat, 16 May 2020 01:59With CLANG it seems to be beneficial to use the Mirek's new MemSet() for buffer sizes above about 1M
I guess L2 cache size plays a role here. The new trick bypasses the cache so kicks in when cache is exhausted...
Mirek
Hi,
Where can I find the new trick?
BR,
Tom
|
|
|
|
Re: BufferPainter::Clear() optimization [message #53952 is a reply to message #53951] |
Sun, 17 May 2020 18:05 |
|
mirek
Messages: 13975 Registered: November 2005
|
Ultimate Member |
|
|
What about this:
#include <CtrlLib/CtrlLib.h>
#include <smmintrin.h>
using namespace Upp;
void Fill0(RGBA *t, RGBA c, int len)
{
while(len >= 16) {
t[0] = c; t[1] = c; t[2] = c; t[3] = c;
t[4] = c; t[5] = c; t[6] = c; t[7] = c;
t[8] = c; t[9] = c; t[10] = c; t[11] = c;
t[12] = c; t[13] = c; t[14] = c; t[15] = c;
t += 16;
len -= 16;
}
switch(len & 15) {
case 15: t[14] = c;
case 14: t[13] = c;
case 13: t[12] = c;
case 12: t[11] = c;
case 11: t[10] = c;
case 10: t[9] = c;
case 9: t[8] = c;
case 8: t[7] = c;
case 7: t[6] = c;
case 6: t[5] = c;
case 5: t[4] = c;
case 4: t[3] = c;
case 3: t[2] = c;
case 2: t[1] = c;
case 1: t[0] = c;
}
}
void Fill2(RGBA *t, RGBA c, int len)
{
while(len >= 16) {
t[0] = c; t[1] = c; t[2] = c; t[3] = c;
t[4] = c; t[5] = c; t[6] = c; t[7] = c;
t[8] = c; t[9] = c; t[10] = c; t[11] = c;
t[12] = c; t[13] = c; t[14] = c; t[15] = c;
t += 16;
len -= 16;
}
if(len & 8) {
t[0] = t[1] = t[2] = t[3] = t[4] = t[5] = t[6] = t[7] = c;
t += 8;
}
if(len & 4) {
t[0] = t[1] = t[2] = t[3] = c;
t += 4;
}
if(len & 2) {
t[0] = t[1] = c;
t += 2;
}
if(len & 1)
t[0] = c;
}
void Fill3(RGBA *t, RGBA c, int len)
{
dword m[4];
m[0] = m[1] = m[2] = m[3] = *(dword*)&(c);
__m128d val = _mm_loadu_pd((double *)m);
if(len >= 16) {
if(len > 1024*1024 / 16 && ((uintptr_t)t & 3) == 0) { // for really huge data, bypass the cache
while((uintptr_t)t & 15) { // align to 16 bytes for SSE
*t++ = c;
len--;
}
do {
_mm_stream_pd((double *)t, val);
_mm_stream_pd((double *)(t + 4), val);
_mm_stream_pd((double *)(t + 8), val);
_mm_stream_pd((double *)(t + 12), val);
t += 16;
len -= 16;
}
while(len >= 16);
_mm_sfence();
}
else
do {
_mm_storeu_pd((double *)t, val);
_mm_storeu_pd((double *)(t + 4), val);
_mm_storeu_pd((double *)(t + 8), val);
_mm_storeu_pd((double *)(t + 12), val);
t += 16;
len -= 16;
}
while(len >= 16);
}
if(len & 8) {
_mm_storeu_pd((double *)t, val);
_mm_storeu_pd((double *)(t + 4), val);
t += 8;
}
if(len & 4) {
_mm_storeu_pd((double *)t, val);
t += 4;
}
if(len & 2) {
t[0] = t[1] = c;
t += 2;
}
if(len & 1)
t[0] = c;
}
int len = 2000 * 4000;
GUI_APP_MAIN
{
Color c = Red();
Buffer<RGBA> b(2000);
Vector<int> rnd;
for(int i = 0; i < 200; i++)
rnd << Random(100);
for(int i = 0; i < 1000000; i++) {
{
RTIMING("memsetd");
for(int i = 0; i < rnd.GetCount(); i += 2)
memsetd(b + rnd[i], *(dword*)&(c), rnd[i + 1]);
}
{
RTIMING("Fill");
for(int i = 0; i < rnd.GetCount(); i += 2)
Fill(b + rnd[i], c, rnd[i + 1]);
}
{
RTIMING("Fill0");
for(int i = 0; i < rnd.GetCount(); i += 2)
Fill0(b + rnd[i], c, rnd[i + 1]);
}
{
RTIMING("Fill2");
for(int i = 0; i < rnd.GetCount(); i += 2)
Fill2(b + rnd[i], c, rnd[i + 1]);
}
{
RTIMING("Fill3");
for(int i = 0; i < rnd.GetCount(); i += 2)
Fill3(b + rnd[i], c, rnd[i + 1]);
}
{
RTIMING("memset");
for(int i = 0; i < rnd.GetCount(); i += 2)
memset(b + 4 * rnd[i], 255, 4 * rnd[i + 1]);
}
}
b.Alloc(len);
for(int i = 0; i < 20; i++) {
memsetd(b, *(dword*)&(c), len);
{
RTIMING("HUGE memsetd");
memsetd(b, *(dword*)&(c), len);
}
{
RTIMING("HUGE Fill");
Fill(b, c, len);
}
{
RTIMING("HUGE Fill3");
Fill3(b, c, len);
}
{
RTIMING("HUGE memset");
memset(b, c, len * 4);
}
}
BeepExclamation();
}
I believe Fill3 does not have any weakness here... Actually, CLANG produced almost exactly the same code for Fill2 and memsetd for small fills, but I guess providing SSE2 implementation directly does not hurt anything. Plus we still like to have that cache bypass...
So I would go for Fill3 for X86 and Fill2 for non-X86 (in the hope it gets optimized for neon on ARM...)
Mirek
[Updated on: Sun, 17 May 2020 18:09] Report message to a moderator
|
|
|
Re: BufferPainter::Clear() optimization [message #53953 is a reply to message #53952] |
Sun, 17 May 2020 20:56 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Hi Mirek,
Here are my results:
CLANG
TIMING HUGE memset : 27.28 ms - 1.36 ms (27.29 ms / 20 ), min: 1.25 ms, max: 1.66 ms, nesting: 0 - 20
TIMING HUGE Fill3 : 35.01 ms - 1.75 ms (35.01 ms / 20 ), min: 1.63 ms, max: 1.99 ms, nesting: 0 - 20
TIMING HUGE Fill : 73.74 ms - 3.69 ms (73.75 ms / 20 ), min: 3.32 ms, max: 7.43 ms, nesting: 0 - 20
TIMING HUGE memsetd : 72.88 ms - 3.64 ms (72.88 ms / 20 ), min: 3.40 ms, max: 4.51 ms, nesting: 0 - 20
TIMING memset : 1.01 s - 1.01 us ( 1.07 s / 1000000 ), min: 1.00 us, max: 28.00 us, nesting: 0 - 1000000
TIMING Fill3 : 505.44 ms - 505.44 ns (565.98 ms / 1000000 ), min: 0.00 ns, max: 29.00 us, nesting: 0 - 1000000
TIMING Fill2 : 497.06 ms - 497.06 ns (557.61 ms / 1000000 ), min: 0.00 ns, max: 28.00 us, nesting: 0 - 1000000
TIMING Fill0 : 772.53 ms - 772.53 ns (833.07 ms / 1000000 ), min: 0.00 ns, max: 63.00 us, nesting: 0 - 1000000
TIMING Fill : 1.67 s - 1.67 us ( 1.73 s / 1000000 ), min: 1.00 us, max: 58.00 us, nesting: 0 - 1000000
TIMING memsetd : 495.67 ms - 495.67 ns (556.22 ms / 1000000 ), min: 0.00 ns, max: 28.00 us, nesting: 0 - 1000000
CLANGx64
TIMING HUGE memset : 27.76 ms - 1.39 ms (27.76 ms / 20 ), min: 1.28 ms, max: 1.80 ms, nesting: 0 - 20
TIMING HUGE Fill3 : 36.31 ms - 1.82 ms (36.31 ms / 20 ), min: 1.59 ms, max: 2.27 ms, nesting: 0 - 20
TIMING HUGE Fill : 73.42 ms - 3.67 ms (73.42 ms / 20 ), min: 3.41 ms, max: 4.74 ms, nesting: 0 - 20
TIMING HUGE memsetd : 74.52 ms - 3.73 ms (74.52 ms / 20 ), min: 3.47 ms, max: 4.22 ms, nesting: 0 - 20
TIMING memset : 898.49 ms - 898.49 ns (925.83 ms / 1000000 ), min: 0.00 ns, max: 52.00 us, nesting: 0 - 1000000
TIMING Fill3 : 492.59 ms - 492.59 ns (519.92 ms / 1000000 ), min: 0.00 ns, max: 32.00 us, nesting: 0 - 1000000
TIMING Fill2 : 495.82 ms - 495.82 ns (523.15 ms / 1000000 ), min: 0.00 ns, max: 28.00 us, nesting: 0 - 1000000
TIMING Fill0 : 569.61 ms - 569.61 ns (596.95 ms / 1000000 ), min: 0.00 ns, max: 41.00 us, nesting: 0 - 1000000
TIMING Fill : 591.56 ms - 591.56 ns (618.90 ms / 1000000 ), min: 0.00 ns, max: 30.00 us, nesting: 0 - 1000000
TIMING memsetd : 549.04 ms - 549.04 ns (576.37 ms / 1000000 ), min: 0.00 ns, max: 65.00 us, nesting: 0 - 1000000
MSBT19
TIMING HUGE memset : 26.51 ms - 1.33 ms (26.51 ms / 20 ), min: 1.26 ms, max: 1.49 ms, nesting: 0 - 20
TIMING HUGE Fill3 : 35.42 ms - 1.77 ms (35.42 ms / 20 ), min: 1.58 ms, max: 2.14 ms, nesting: 0 - 20
TIMING HUGE Fill : 25.47 ms - 1.27 ms (25.48 ms / 20 ), min: 1.18 ms, max: 1.59 ms, nesting: 0 - 20
TIMING HUGE memsetd : 25.12 ms - 1.26 ms (25.12 ms / 20 ), min: 1.15 ms, max: 1.59 ms, nesting: 0 - 20
TIMING memset : 978.21 ms - 978.21 ns ( 1.05 s / 1000000 ), min: 1.00 us, max: 29.00 us, nesting: 0 - 1000000
TIMING Fill3 : 1.50 s - 1.50 us ( 1.58 s / 1000000 ), min: 1.00 us, max: 29.00 us, nesting: 0 - 1000000
TIMING Fill2 : 1.89 s - 1.89 us ( 1.96 s / 1000000 ), min: 1.00 us, max: 34.00 us, nesting: 0 - 1000000
TIMING Fill0 : 2.02 s - 2.02 us ( 2.09 s / 1000000 ), min: 1.00 us, max: 33.00 us, nesting: 0 - 1000000
TIMING Fill : 2.06 s - 2.06 us ( 2.14 s / 1000000 ), min: 1.00 us, max: 32.00 us, nesting: 0 - 1000000
TIMING memsetd : 1.62 s - 1.62 us ( 1.69 s / 1000000 ), min: 1.00 us, max: 45.00 us, nesting: 0 - 1000000
MSBT19x64
TIMING HUGE memset : 26.96 ms - 1.35 ms (26.96 ms / 20 ), min: 1.27 ms, max: 1.90 ms, nesting: 0 - 20
TIMING HUGE Fill3 : 35.07 ms - 1.75 ms (35.08 ms / 20 ), min: 1.62 ms, max: 2.02 ms, nesting: 0 - 20
TIMING HUGE Fill : 67.09 ms - 3.35 ms (67.09 ms / 20 ), min: 3.17 ms, max: 3.60 ms, nesting: 0 - 20
TIMING HUGE memsetd : 25.64 ms - 1.28 ms (25.64 ms / 20 ), min: 1.19 ms, max: 1.48 ms, nesting: 0 - 20
TIMING memset : 818.75 ms - 818.75 ns (856.11 ms / 1000000 ), min: 0.00 ns, max: 31.00 us, nesting: 0 - 1000000
TIMING Fill3 : 1.36 s - 1.36 us ( 1.40 s / 1000000 ), min: 1.00 us, max: 31.00 us, nesting: 0 - 1000000
TIMING Fill2 : 1.67 s - 1.67 us ( 1.70 s / 1000000 ), min: 1.00 us, max: 30.00 us, nesting: 0 - 1000000
TIMING Fill0 : 1.66 s - 1.66 us ( 1.70 s / 1000000 ), min: 1.00 us, max: 46.00 us, nesting: 0 - 1000000
TIMING Fill : 1.68 s - 1.68 us ( 1.72 s / 1000000 ), min: 1.00 us, max: 50.00 us, nesting: 0 - 1000000
TIMING memsetd : 1.50 s - 1.50 us ( 1.54 s / 1000000 ), min: 1.00 us, max: 29.00 us, nesting: 0 - 1000000
Fill3 is generally the best, but I experience two issues behind the scenes of this benchmark:
1. On MSBT19 / MSBT19x64 there is a significant penalty for small counts. It results in 5 ns per call, whereas in CLANG it is about 0.8 - 1.0 ns per call.
2. On MSBT19x64 the optimal threshold size is 2M counts on my Core i7. However, interestingly the default threshold value works better with MSBT19 on the same computer.
I will continue to investigate this.
Thanks and best regards,
Tom
|
|
|
|
Re: BufferPainter::Clear() optimization [message #53955 is a reply to message #53751] |
Sun, 17 May 2020 22:00 |
Oblivion
Messages: 1092 Registered: August 2007
|
Senior Contributor |
|
|
Here's results on (AMD FX, linux x64):
GCC:
TIMING HUGE memset : 113,98 ms - 5,70 ms (114,00 ms / 20 ), min: 5,00 ms, max: 6,00 ms, nesting: 0 - 20
TIMING HUGE Fill3 : 81,98 ms - 4,10 ms (82,00 ms / 20 ), min: 4,00 ms, max: 5,00 ms, nesting: 0 - 20
TIMING HUGE Fill : 145,98 ms - 7,30 ms (146,00 ms / 20 ), min: 7,00 ms, max: 8,00 ms, nesting: 0 - 20
TIMING HUGE memsetd : 125,98 ms - 6,30 ms (126,00 ms / 20 ), min: 6,00 ms, max: 7,00 ms, nesting: 0 - 20
TIMING memset : 1,24 s - 1,24 us ( 2,23 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill3 : 2,01 s - 2,01 us ( 3,01 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill2 : 2,43 s - 2,43 us ( 3,42 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill0 : 2,77 s - 2,77 us ( 3,76 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill : 2,72 s - 2,72 us ( 3,71 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING memsetd : 1,80 s - 1,80 us ( 2,80 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
ClANG:
TIMING HUGE memset : 120,98 ms - 6,05 ms (121,00 ms / 20 ), min: 5,00 ms, max: 7,00 ms, nesting: 0 - 20
TIMING HUGE Fill3 : 81,98 ms - 4,10 ms (82,00 ms / 20 ), min: 4,00 ms, max: 5,00 ms, nesting: 0 - 20
TIMING HUGE Fill : 130,98 ms - 6,55 ms (131,00 ms / 20 ), min: 6,00 ms, max: 7,00 ms, nesting: 0 - 20
TIMING HUGE memsetd : 133,98 ms - 6,70 ms (134,00 ms / 20 ), min: 6,00 ms, max: 7,00 ms, nesting: 0 - 20
TIMING memset : 1,49 s - 1,49 us ( 2,39 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill3 : 1,74 s - 1,74 us ( 2,64 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill2 : 1,62 s - 1,62 us ( 2,52 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill0 : 2,00 s - 2,00 us ( 2,90 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING Fill : 2,06 s - 2,06 us ( 2,96 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
TIMING memsetd : 2,18 s - 2,18 us ( 3,08 s / 1000000 ), min: 0,00 ns, max: 1,00 ms, nesting: 0 - 1000000
Best regards,
Oblivion
Github page: https://github.com/ismail-yilmaz
upp-components: https://github.com/ismail-yilmaz/upp-components
Bobcat the terminal emulator: https://github.com/ismail-yilmaz/Bobcat
|
|
|
Re: BufferPainter::Clear() optimization [message #53957 is a reply to message #53953] |
Sun, 17 May 2020 23:25 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Mirek,
Please check out this one. It features better performance on MSBT19 / MSBT19x64 with low counts, and works well on CLANG/CLANGx64 too:
inline void new_memset128(void *b, dword data, int len){
switch(len){
case 4: ((dword *)b)[3] = data;
case 3: ((dword *)b)[2] = data;
case 2: ((dword *)b)[1] = data;
case 1: ((dword *)b)[0] = data;
case 0: return;
}
__m128i q = _mm_set1_epi32(*(int*)&data);
__m128i *w = (__m128i*)b;
switch(len>>2){
default:{
__m128i *e = (__m128i*)b + (len>>2) - 4;
if(len <= 2*1024*1024){
while(w<e){
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
}
}
else{
while(w<e){
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
}
}
}
case 4: _mm_storeu_si128(w++, q);
case 3: _mm_storeu_si128(w++, q);
case 2: _mm_storeu_si128(w++, q);
case 1: _mm_storeu_si128(w++, q);
}
switch(len&3){
case 3: ((dword *)b)[len-3] = data;
case 2: ((dword *)b)[len-2] = data;
case 1: ((dword *)b)[len-1] = data;
}
}
Best regards,
Tom
EDIT: Fine tuning...
[Updated on: Sun, 17 May 2020 23:55] Report message to a moderator
|
|
|
|
Re: BufferPainter::Clear() optimization [message #53961 is a reply to message #53960] |
Mon, 18 May 2020 11:13 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Hi,
Yes, you're right: The alignment should be handled. I'll take a look at it... (just need to minimize the code size in order to avoid penalty for short transfers. It is extremely sensitive.)
The cache limit is intentionally 8MB as this is the sweet spot for my Core i7. Probably should get this value from the system to optimize the correct threshold.
The -4 compensates the rest of the samples handled within the leaked default in the switch. (The below cases do the trick).
Best regards,
Tom
|
|
|
Re: BufferPainter::Clear() optimization [message #53962 is a reply to message #53961] |
Mon, 18 May 2020 13:31 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Hi,
Alignment corrected. (Caused obviously a lot of rearranging things to obtain balance.) Threshold is still at 8M, but feel free to experiment.
inline void new_memset128(void *b, dword data, int len){
switch(len){
case 5: ((dword *)b)[4] = data;
case 4: ((dword *)b)[3] = data;
case 3: ((dword *)b)[2] = data;
case 2: ((dword *)b)[1] = data;
case 1: ((dword *)b)[0] = data;
case 0: return;
}
__m128i q = _mm_set1_epi32(*(int*)&data);
__m128i *w = (__m128i*)b;
__m128i *e = (__m128i*)b + (len>>2);
if(len <= 2*1024*1024 || ((uintptr_t)b&3)){
while(w<e-1){
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
}
if(w<e) _mm_storeu_si128(w++, q);
}
else{
int s=(-((int)((uintptr_t)b)>>2))&0x3;
switch(s){
case 3: ((dword *)b)[2] = data;
case 2: ((dword *)b)[1] = data;
case 1: ((dword *)b)[0] = data;
}
w = (__m128i*) ((dword*)b + s);
while(w<e) _mm_stream_si128(w++, q);
_mm_sfence();
}
switch(len&3){
case 3: ((dword *)b)[len-3] = data;
case 2: ((dword *)b)[len-2] = data;
case 1: ((dword *)b)[len-1] = data;
}
}
Best regards,
Tom
|
|
|
|
|
|
|
Re: BufferPainter::Clear() optimization [message #53967 is a reply to message #53966] |
Mon, 18 May 2020 17:08 |
Tom1
Messages: 1212 Registered: March 2007
|
Senior Contributor |
|
|
Mirek,
Here it is: The unconditional alignment. I took your idea, ditched my own, and modified your Fill3a as follows:
void inline Fill3T(void *b, dword data, int len){
switch(len){
case 3: ((dword *)b)[2] = data;
case 2: ((dword *)b)[1] = data;
case 1: ((dword *)b)[0] = data;
case 0: return;
}
__m128i q = _mm_set1_epi32(*(int*)&data);
__m128i *w = (__m128i*)b;
if(len >= 32) {
__m128i *e = (__m128i*)b + (len>>2) - 8;
if(len > 1024*1024 / 16 && ((uintptr_t)w & 3) == 0) { // for really huge data, bypass the cache
_mm_storeu_si128(w, q); // Head align
int s=(-((int)((uintptr_t)b)>>2))&0x3;
w = (__m128i*) ((dword*)b) + s;
do {
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
_mm_stream_si128(w++, q);
}while(w<=e);
_mm_sfence();
}
else
do {
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
}while(w<=e);
}
if(len & 16) {
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
}
if(len & 8) {
_mm_storeu_si128(w++, q);
_mm_storeu_si128(w++, q);
}
if(len & 4) {
_mm_storeu_si128(w, q);
}
_mm_storeu_si128((__m128i*) (((dword*)b) + len - 4), q); // Tail align
}
I made some other changes too and this one is slightly faster on short transfers while equals Fill3a() on longer ones. The improvement is more significant on MSBT19 / MSBT19x64.
In order to get real fast short transfers, the function must be 'inline'. I think this necessitates two variants of the final function. (I have seen that BufferPainter paints most of the time with really short fills, so inlining really makes a difference there.)
Best regards,
Tom
P.S. My cache threshold is still at 8M...
[Updated on: Mon, 18 May 2020 17:11] Report message to a moderator
|
|
|
|
|
Goto Forum:
Current Time: Sat Apr 20 07:47:50 CEST 2024
Total time taken to generate the page: 0.08861 seconds
|