#include <Core/Core.h>
#include <Painter/Painter.h>

using namespace Upp;

// Formats a 128-bit SSE register as four 32-bit hex words for logging.
// Words are printed from the highest 32 bits down to the lowest, each
// prefixed with '_'.
never_inline
String AsString(__m128i x)
{
	dword word[4];
	memcpy(word, &x, sizeof(x)); // copy the register bytes out instead of casting
	return Sprintf("_%08x_%08x_%08x_%08x", word[3], word[2], word[1], word[0]);
}

// DUMPS(x): debugging aid for __m128i values.
// When _DEBUG is defined it logs the expression text followed by the
// AsString() hex dump of its value through RLOG; otherwise it expands to
// nothing. The debug expansion already ends with ';'.
#ifdef _DEBUG
#define DUMPS(x) RLOG(#x << " = " << AsString(x));
#else
#define DUMPS(x)
#endif

// Loads one pixel (read as a single 32-bit word) and widens each of its
// four bytes to a 16-bit lane. The bytes land in lanes 0..3; lanes 4..7
// are zero.
__m128i LoadRGBA(const RGBA *s)
{
	__m128i pixel = _mm_set_epi32(0, 0, 0, *(dword *)s);
	__m128i zero = _mm_setzero_si128();
	return _mm_unpacklo_epi8(pixel, zero); // interleave with zero bytes = zero-extend
}

__m128i LoadRGBA2(const RGBA *s)
{
	return _mm_unpacklo_epi8(_mm_set_epi32(0, 0, *(dword *)(s + 1), *(dword *)s), _mm_setzero_si128());
}

// Widens the low 8 bytes of x to eight 16-bit lanes (zero-extended).
__m128i LoadRGBAL(__m128i x)
{
	const __m128i zero = _mm_setzero_si128();
	return _mm_unpacklo_epi8(x, zero);
}

// Widens the high 8 bytes of x to eight 16-bit lanes (zero-extended).
__m128i LoadRGBAH(__m128i x)
{
	const __m128i zero = _mm_setzero_si128();
	return _mm_unpackhi_epi8(x, zero);
}

// Narrows two vectors of 16-bit lanes back to 16 bytes with unsigned
// saturation: the lanes of l become bytes 0..7, the lanes of h bytes 8..15.
__m128i PackRGBA(__m128i l, __m128i h)
{
	__m128i packed = _mm_packus_epi16(l, h);
	return packed;
}

// Packs the 16-bit lanes of x to bytes and writes the low 32 bits
// (one pixel, taken from lanes 0..3) to *rgba.
void StoreRGBA(RGBA *rgba, __m128i x)
{
	__m128i bytes = PackRGBA(x, _mm_setzero_si128());
	_mm_storeu_si32(rgba, bytes);
}

// Packs the 16-bit lanes of x to bytes and writes the low 64 bits
// (two pixels, taken from lanes 0..7) starting at rgba.
void StoreRGBA2(RGBA *rgba, __m128i x)
{
	__m128i bytes = PackRGBA(x, _mm_setzero_si128());
	_mm_storeu_si64(rgba, bytes);
}

// Replicates 16-bit lane 3 across lanes 0..3 and lane 7 across lanes 4..7.
// With a widened pixel in each half, that is the last channel of each
// pixel (the alpha channel, going by this function's name).
__m128i BroadcastAlpha(__m128i x)
{
	__m128i upper = _mm_shufflehi_epi16(x, 0xff); // lanes 4..7 <- lane 7
	__m128i both = _mm_shufflelo_epi16(upper, 0xff); // lanes 0..3 <- lane 3
	return both;
}

// Scales every 16-bit lane of x by cover / 256, i.e. (x * cover) >> 8.
// cover is expected in the range 0 - 256; only the low 16 bits of each
// product are kept before the shift.
__m128i ApplyCover(__m128i x, __m128i cover)  // cover is 0 - 256
{
	__m128i scaled = _mm_mullo_epi16(cover, x); // int a = c.a * cover
	return _mm_srli_epi16(scaled, 8);            //         >> 8;
}

// Derives the per-lane target weight from a widened source pixel vector:
// with a = the value BroadcastAlpha() spreads over each half, every lane
// becomes 256 - a - (a >> 7). So a == 0 gives 256 and a == 255 gives 0.
__m128i MakeAlpha(__m128i x)
{
	const __m128i a = BroadcastAlpha(x);
	const __m128i rounded = _mm_add_epi16(a, _mm_srli_epi16(a, 7)); // a + (a >> 7)
	return _mm_sub_epi16(_mm_set1_epi16(256), rounded); // int alpha = 256 - a - (a >> 7);
}

// Blends per 16-bit lane: result = c + ((t * alpha) >> 8).
// t is the target, c the source color and alpha the weight applied to the
// target. The final addition uses signed saturation.
__m128i AlphaBlendSSE2(__m128i t, __m128i c, __m128i alpha)
{
	__m128i weighted = _mm_mullo_epi16(t, alpha); // t * alpha (low 16 bits)
	__m128i kept = _mm_srli_epi16(weighted, 8);   // >> 8
	return _mm_adds_epi16(c, kept);               // t = c + (t * alpha >> 8);
}

__m128i AlphaBlendSSE2(__m128i t, __m128i c)
{
	return AlphaBlendSSE2(t, c, MakeAlpha(c));
}

// Blends the single color c over the one pixel at t, in place.
void AlphaBlend1(RGBA *t, const RGBA& c)
{
	__m128i target = LoadRGBA(t);
	__m128i source = LoadRGBA(&c);
	StoreRGBA(t, AlphaBlendSSE2(target, source));
}

// Blends the one pixel at s over the one pixel at t, in place.
void AlphaBlend1(RGBA *t, const RGBA *s)
{
	__m128i target = LoadRGBA(t);
	__m128i source = LoadRGBA(s);
	StoreRGBA(t, AlphaBlendSSE2(target, source));
}

// Blends the two pixels s[0..1] over the two pixels t[0..1], in place.
void AlphaBlend2(RGBA *t, const RGBA *s)
{
	__m128i target = LoadRGBA2(t);
	__m128i source = LoadRGBA2(s);
	StoreRGBA2(t, AlphaBlendSSE2(target, source));
}

// Blends the single color c over the two pixels t[0..1], in place.
//
// LoadRGBA() widens c into lanes 0..3 only and leaves lanes 4..7 zero.
// Blending with that vector directly would give the second pixel a zero
// color and a weight of 256, leaving t[1] unchanged. The widened color is
// therefore duplicated into both 64-bit halves before blending.
void AlphaBlend2(RGBA *t, const RGBA& c)
{
	__m128i one = LoadRGBA(&c);
	__m128i both = _mm_unpacklo_epi64(one, one); // low half copied into both halves
	StoreRGBA2(t, AlphaBlendSSE2(LoadRGBA2(t), both));
}

// Blends the four pixels s[0..3] over the four pixels t[0..3], in place.
// Both arrays are accessed with unaligned 128-bit loads/stores; the low and
// high pixel pairs are widened, blended separately and packed back together.
void AlphaBlend4(RGBA *t, const RGBA *s)
{
	__m128i *target = (__m128i *)t;
	__m128i dst = _mm_loadu_si128(target);
	__m128i src = _mm_loadu_si128((__m128i *)s);

	__m128i low = AlphaBlendSSE2(LoadRGBAL(dst), LoadRGBAL(src));   // pixels 0..1
	__m128i high = AlphaBlendSSE2(LoadRGBAH(dst), LoadRGBAH(src));  // pixels 2..3

	_mm_storeu_si128(target, PackRGBA(low, high));
}

// Blends len pixels from s over len pixels at t, in place.
// Pixels are processed four at a time; the remaining 0..3 pixels are
// handled by one optional two-pixel step and one optional single-pixel step.
//
// A non-positive len is a no-op. Without the guard a negative len would skip
// the main loop but could still satisfy the `len & 2` / `len & 1` tests
// (e.g. len == -1), blending pixels the caller never asked for.
void AlphaBlendSSE2(RGBA *t, const RGBA *s, int len)
{
	if(len <= 0)
		return;
	for(; len >= 4; len -= 4) {
		AlphaBlend4(t, s);
		t += 4;
		s += 4;
	}
	// len is now 0..3: bit 1 selects a pair, bit 0 a final single pixel
	if(len & 2) {
		AlphaBlend2(t, s);
		t += 2;
		s += 2;
	}
	if(len & 1)
		AlphaBlend1(t, s);
}

// Produces a random test pixel: a Color built from three Random(256) values,
// multiplied by a fourth Random(256) value cast to int.
// NOTE(review): presumably `int * Color` is the library operator that yields
// an RGBA with that alpha applied — confirm against the Color/RGBA headers.
// The order in which the four Random() calls run is left to the compiler.
RGBA RandomRGBA()
{
	return (int)Random(256) * Color(Random(256), Random(256), Random(256));
}

// Randomized comparison of AlphaBlendSSE2() against AlphaBlend(), which is
// not defined in this file (presumably the reference implementation from the
// included headers — confirm). Each of the 1,000,000 rounds blends a fresh
// random source over identical copies of a random target with both routines
// and panics on the first mismatch. Both calls are wrapped in RTIMING scopes.
CONSOLE_APP_MAIN
{
	StdLogSetup(LOG_COUT|LOG_FILE);

	for(int i = 0; i < 1000000; i++) {
		int len = Random(100);
		Vector<RGBA> t;
		Vector<RGBA> s;

		// Fill target and source with len random pixels each.
		for(int n = 0; n < len; n++) {
			t.Add(RandomRGBA());
			s.Add(RandomRGBA());
		}

		// t1 receives the AlphaBlend() result, t the SSE2 result.
		Vector<RGBA> t1 = clone(t);

		{
			RTIMING("Non SSE");
			AlphaBlend(t1, s, len);
		}

		{
			RTIMING("SSE");
			AlphaBlendSSE2(t, s, len);
		}

		// Any difference is a failure: dump both results and the round data.
		if(t != t1) {
			RDUMP(t);
			RDUMP(t1);
			RDUMP(i);
			RDUMP(len);
			Panic("Failed");
		}
	}
}
