What's new

Assembly Optimization

Doomulation

?????????????????????????
Okay... so this is my first big chunk written in assembly. The only problem is that it's SLOW, and it must become faster! This is heavy number crunching, so any way to speed it up is fine. Currently, my processor only supports MMX/SSE/3DNow!, but later I will have one that supports it all: MMX/SSE/SSE2/SSE3/3DNow!/3DNow! Professional/64-bit support/X2. So I will take any optimizations you can teach me!

Help me? Please? :p

Code:
Code:
void ByteMismatchA(/*uint& i, BYTE& b2, BYTE& b1*/)
{
	/* From Pass1:
		esi = i
		ecx = nEofSize
		edi = pBufferEof
		ebp = pBufferNEof
		bl = b1;
		bh = b2; */

	// Bytes mismatch. First try to scan further in the file to see if the same pattern of bytes might
	// exist further in the file.
	__asm
	{
		// ecx = nNEofSize
		// edx = lPosNEof
		// bl (bx, ebx) = b
		push ecx;
#ifdef DEBUG
		xor ecx, ecx;
#endif
		mov ecx, nNEofSize;
		mov edx, esi;
		sub ecx, esi;

begin_loop:
		inc edx;
		mov al, [ebp + edx];
		/* Do call to function */
		loop begin_loop; // if (lPosNEof > nNEofSize)

		// Registers used in Pass1 - so save them.
#ifdef DEBUG
		xor eax, eax;
		xor ecx, ecx;
		xor edx, edx;
#endif

		push ebp;
		push edi;
#ifdef DEBUG
		xor ebp, ebp;
		xor edi, edi;
#endif

		// As the loop quits, following registers are now free: ecx, edx, esi, al, ebx, ebp, edi
		// Following are NOT free: bl, eax
		
		// edx = nDiffUsedSize
		// ebp = nDiffBuffSize
		// edi = pBufferDiff
		// esi = i
		mov ecx, esi;
		call AddressSize;
#ifdef DEBUG
		xor ecx, ecx;
#endif

		mov edx, nDiffUsedSize;
		mov ebp, nDiffBuffSize;
		mov edi, pBufferDiff;

		add eax, esi;
		cmp eax, ebp;
		jb pass2; // if (nDiffUsedSize + AddressSize(nPos) < nDiffBuffSize) jump to pass2

		// Allocate more memory
#ifdef DEBUG
		xor eax, eax;
#endif
		push ebp;
		push edi;
		call realloc;
		add esp, 8; // Clean stack
		mov esi, eax; // New pointer
		
pass2:
#ifdef DEBUG
		xor eax, eax;
#endif
		cmp esi, 0xFF;
		jbe pass3; // if (nPos > 255) jump to pass3;
		cmp esi, 0xFFFF;
		jbe pass4;
		cmp esi, 0xFFFFFFFF;
		jbe pass5;

pass3:
		// i <= 255
		mov cl, numof255;
		inc cl;
		mov eax, edi;
		add eax, edx;
		mov [eax], cl; // Copy numof255 to pBufferDiff + nDiffUsedSize
		inc edx;
		mov numof255, cl;
#ifdef DEBUG
		xor eax, eax;
		xor ecx, ecx;
#endif
		jmp pass6;

pass4:
		// i <= 65535
		mov cx, numof65535;
		inc cx;
		mov eax, edi;
		add eax, edx;
		mov [eax], cx;
		add edx, 2;
		mov numof65535, cx;
#ifdef DEBUG
		xor eax, eax;
		xor ecx, ecx;
#endif
		jmp pass6;

pass5:
		// i > 65535
		mov [edi + edx], esi;
		add edx, 4;

pass6:
		//pBufferDiff[nDiffUsedSize++] = b2;
		mov eax, edi;
		add eax, edx;
		mov [eax], ah;
		inc edx;
#ifdef DEBUG
		xor eax, eax;
#endif
		
		// Now synchronize data!
		mov nDiffUsedSize, edx;
		mov nDiffBuffSize, ebp;
		mov pBufferDiff, edi;
#ifdef DEBUG
		xor edx, edx;
		xor ebp, ebp;
		xor edi, edi;
#endif
		pop edi;
		pop ebp;
		pop ecx;
		//pop edi;
		//pop ebp;
		//pop ebx;
	};
}

// Walks pBufferEof and pBufferNEof in parallel and calls ByteMismatchA for
// every position where the two bytes differ.
//
// The parameters i and nOneHundredth are not referenced by this block; the
// loop state lives in registers only. Globals read: pBufferEof, pBufferNEof,
// nEofSize.
//
// Register roles inside the loop (ByteMismatchA reads these as its inputs):
//   esi = i               (current position)
//   ecx = iterations left
//   edi = pBufferEof
//   ebp = pBufferNEof
//   bl  = b1 (byte from pBufferEof), bh = b2 (byte from pBufferNEof)
//
// ebp is saved on entry and restored on exit, because this block overwrites
// it and nothing else in the block puts the caller's value back.
void Pass1(uint& i, uint& nOneHundredth)
{
	__asm
	{
		push ebp; // ebp is repurposed as pBufferNEof below

		// For debugging, we will zero out all registers not in use at the moment to make it easier to see what registers may be used.
#ifdef DEBUG
		xor eax, eax;
		xor ebx, ebx;
		xor ecx, ecx;
		xor edx, edx;
		xor edi, edi;
		xor esi, esi;
		xor ebp, ebp;
#endif

		mov edi, pBufferEof;
		mov ebp, pBufferNEof;
		xor esi, esi; // i = 0
		mov ecx, nEofSize;
		inc ecx; // nEofSize + 1 iterations, i.e. the trip count of while (x <= y) written as while (x < y + 1)

		// NOTE(review): esi is incremented before the bytes are loaded, so
		// index 0 is never compared and the last iteration reads index
		// nEofSize + 1 - confirm this range is intended.
do_loop:
		inc esi;
		mov bl, [edi + esi];
		mov bh, [ebp + esi];
		cmp bl, bh;
		je redo_loop; // if (b1 == b2) continue;
		call ByteMismatchA; // takes esi/edi/ebp/bl/bh as inputs; saves and restores ecx, edi and ebp
#ifdef DEBUG
		xor ebx, ebx;
#endif

redo_loop:
		loop do_loop; // --ecx; repeat while ecx != 0

		pop ebp;
	};
}

Compiled with "Omit frame pointers" to release ebp.
 

Runik

Saturnin forever !
I'm not an assembly master, I'm just using it from time to time for some specific bits, but you should use a profiler to see what's making your code slow ...
AMD's CodeAnalyst is nice for that, it's free, and it'll work with any CPU (even Intel ones ;) )

And don't forget that the best optimization lies most of the time in a better algorithm ;)
 
Last edited:
OP
Doomulation

Doomulation

?????????????????????????
I have it and I have run it. I don't know how to use the simulation mode (the tutorial is bad), but the timer profiler indicates that the bottlenecks are three instructions: "inc edx" (ByteMismatchA + 15), "mov al, [ebp + edx]" (ByteMismatchA + 16) and lastly "loop begin_loop" (ByteMismatchA + 20).

I have no idea on how to optimize that.
Basically, the whole bottleneck is the loop...

Code:
		inc edx; // advance the scan position by one byte
		mov al, [ebp + edx]; // fetch the byte at ebp + edx into al
		/* Do call to function */
		loop begin_loop; // decrement ecx and jump back while it is non-zero (original note: if (lPosNEof > nNEofSize))

...which is necessary code, and it will only get slower! That is because for now it only fetches the byte, but later it will also examine it!
 

Top