Assembly Optimization

Doomulation · Feb 27, 2006

Okay... so this is my first big chunk written in assembly. The only problem is - it's SLOW! It must become faster! This is incredible number crunching, so any way to speed it up is fine! Currently, my processor only supports MMX/SSE/3DNow!, but later I shall have one that supports it all: MMX/SSE/SSE2/SSE3/3DNow!/3DNow! Professional/64-bit support/X2. So I will take any optimizations you can teach me!

Help me? Please?

Code:

void ByteMismatchA(/*uint& i, BYTE& b2, BYTE& b1*/)
{
	/* From Pass1:
		esi = i
		ecx = nEofSize
		edi = pBufferEof
		ebp = pBufferNEof
		bl = b1;
		bh = b2; */

	// Bytes mismatch. First try to scan further in the file to see if the same pattern of bytes might
	// exist further in the file.
	__asm
	{
		// ecx = nNEofSize
		// edx = lPosNEof
		// bl (bx, ebx) = b
		push ecx;
#ifdef DEBUG
		xor ecx, ecx;
#endif
		mov ecx, nNEofSize;
		mov edx, esi;
		sub ecx, esi;

begin_loop:
		inc edx;
		mov al, [ebp + edx];
		/* Do call to function */
		loop begin_loop; // if (lPosNEof > nNEofSize)

		// Registers used in Pass1 - so save them.
#ifdef DEBUG
		xor eax, eax;
		xor ecx, ecx;
		xor edx, edx;
#endif

		push ebp;
		push edi;
#ifdef DEBUG
		xor ebp, ebp;
		xor edi, edi;
#endif

		// As the loop quits, following registers are now free: ecx, edx, esi, al, ebx, ebp, edi
		// Following are NOT free: bl, eax
		
		// edx = nDiffUsedSize
		// ebp = nDiffBuffSize
		// edi = pBufferDiff
		// esi = i
		mov ecx, esi;
		call AddressSize;
#ifdef DEBUG
		xor ecx, ecx;
#endif

		mov edx, nDiffUsedSize;
		mov ebp, nDiffBuffSize;
		mov edi, pBufferDiff;

		add eax, esi;
		cmp eax, ebp;
		jb pass2; // if (nDiffUsedSize + AddressSize(nPos) < nDiffBuffSize) jump to pass2

		// Allocate more memory
#ifdef DEBUG
		xor eax, eax;
#endif
		push ebp;
		push edi;
		call realloc;
		add esp, 8; // Clean stack
		mov esi, eax; // New pointer
		
pass2:
#ifdef DEBUG
		xor eax, eax;
#endif
		cmp esi, 0xFF;
		jbe pass3; // if (nPos > 255) jump to pass3;
		cmp esi, 0xFFFF;
		jbe pass4;
		cmp esi, 0xFFFFFFFF;
		jbe pass5;

pass3:
		// i <= 255
		mov cl, numof255;
		inc cl;
		mov eax, edi;
		add eax, edx;
		mov [eax], cl; // Copy numof255 to pBufferDiff + nDiffUsedSize
		inc edx;
		mov numof255, cl;
#ifdef DEBUG
		xor eax, eax;
		xor ecx, ecx;
#endif
		jmp pass6;

pass4:
		// i <= 65535
		mov cx, numof65535;
		inc cx;
		mov eax, edi;
		add eax, edx;
		mov [eax], cx;
		add edx, 2;
		mov numof65535, cx;
#ifdef DEBUG
		xor eax, eax;
		xor ecx, ecx;
#endif
		jmp pass6;

pass5:
		// i > 65535
		mov [edi + edx], esi;
		add edx, 4;

pass6:
		//pBufferDiff[nDiffUsedSize++] = b2;
		mov eax, edi;
		add eax, edx;
		mov [eax], ah;
		inc edx;
#ifdef DEBUG
		xor eax, eax;
#endif
		
		// Now synchronize data!
		mov nDiffUsedSize, edx;
		mov nDiffBuffSize, ebp;
		mov pBufferDiff, edi;
#ifdef DEBUG
		xor edx, edx;
		xor ebp, ebp;
		xor edi, edi;
#endif
		pop edi;
		pop ebp;
		pop ecx;
		//pop edi;
		//pop ebp;
		//pop ebx;
	};
}

// Pass1 - compares pBufferEof and pBufferNEof byte by byte and calls
// ByteMismatchA for every position where they differ.
//
// The parameters i and nOneHundredth are not used by this body.
//
// Registers inside the loop (this is also the contract ByteMismatchA sees):
//   esi = i (current index, starts at 0)
//   ecx = iterations remaining
//   edi = pBufferEof
//   ebp = pBufferNEof
//   bl  = b1 (byte from pBufferEof), bh = b2 (byte from pBufferNEof)
//
// ebp is saved on entry to the __asm block and restored on exit, because the
// compiler does not preserve it for inline assembly. The file must still be
// compiled with "Omit frame pointers" so the compiler itself does not rely on
// ebp here.
void Pass1(uint& i, uint& nOneHundredth)
{
	__asm
	{
		push ebp;                     // the caller's ebp must survive this block

		// For debugging, zero every register not yet in use so it is easy to
		// see which registers are free.
#ifdef DEBUG
		xor eax, eax;
		xor ebx, ebx;
		xor ecx, ecx;
		xor edx, edx;
		xor edi, edi;
		xor esi, esi;
		xor ebp, ebp;
#endif

		mov edi, pBufferEof;
		mov ebp, pBufferNEof;
		xor esi, esi;                 // i = 0
		mov ecx, nEofSize;
		inc ecx;                      // visit i = 0 .. nEofSize inclusive
		                              // NOTE(review): if nEofSize is a byte count
		                              // rather than the last index, drop this inc.

do_loop:
		mov bl, [edi + esi];          // b1
		mov bh, [ebp + esi];          // b2
		cmp bl, bh;
		je redo_loop;                 // if (b1 == b2) continue;
		call ByteMismatchA;           // expects esi/ecx/edi/ebp/bl/bh as listed above
#ifdef DEBUG
		xor ebx, ebx;
#endif

redo_loop:
		inc esi;                      // advance after the compare so index 0 is not skipped
		dec ecx;                      // dec/jnz instead of the slower `loop`
		jnz do_loop;

		pop ebp;
	};
}

Compiled with "Omit frame pointers" to release ebp.

Runik · Feb 27, 2006

I'm not an assembly master, I'm just using it from time to time for some specific bits, but you should use a profiler to see what's making your code slow ...
AMD's CodeAnalyst is nice for that, it's free, and it'll work with any CPU (even Intel ones).

And don't forget that the best optimization lies most of the time in a better algorithm

Doomulation · Feb 27, 2006

I have it and I have run it. I don't know how to use simulation (bad tutorial) and well... the timer profiler indicates the bottlenecks are 3 instructions: namely "inc edx" (ByteMismatchA + 15), "mov al, [ebp + edx]" (ByteMismatchA + 16) and lastly "loop begin_loop" (ByteMismatchA + 20).

I have no idea on how to optimize that.
Basically, the whole bottleneck is the loop...

Code:

		inc edx;
		mov al, [ebp + edx];
		/* Do call to function */
		loop begin_loop; // if (lPosNEof > nNEofSize)

...which is necessary code and it will only get slower! That is because for now it only fetches the byte, but later it will also examine it!

Search

Search

Assembly Optimization

Doomulation

?????????????????????????

Runik

Saturnin forever !

Doomulation

?????????????????????????