Doomulation
?????????????????????????
Okay... so this is my first big chunk of code written in assembly. The only problem is that it's SLOW, and it must become faster! This is heavy number crunching, so any way to speed it up is fine. Currently my processor only supports MMX/SSE/3DNow!, but later I will have one that supports it all: MMX/SSE/SSE2/SSE3/3DNow!/3DNow! Professional/64-bit support/X2. So I will take any optimizations you can teach me!
Help me? Please?
Code:
Compiled with "Omit frame pointers" to release ebp.
Help me? Please?
Code:
Code:
void ByteMismatchA(/*uint& i, BYTE& b2, BYTE& b1*/)
{
/* From Pass1:
esi = i
ecx = nEofSize
edi = pBufferEof
ebp = pBufferNEof
bl = b1;
bh = b2; */
// Bytes mismatch. First try to scan further in the file to see if the same pattern of bytes might
// exist further in the file.
__asm
{
// ecx = nNEofSize
// edx = lPosNEof
// bl (bx, ebx) = b
push ecx;
#ifdef DEBUG
xor ecx, ecx;
#endif
mov ecx, nNEofSize;
mov edx, esi;
sub ecx, esi;
begin_loop:
inc edx;
mov al, [ebp + edx];
/* Do call to function */
loop begin_loop; // if (lPosNEof > nNEofSize)
// Registers used in Pass1 - so save them.
#ifdef DEBUG
xor eax, eax;
xor ecx, ecx;
xor edx, edx;
#endif
push ebp;
push edi;
#ifdef DEBUG
xor ebp, ebp;
xor edi, edi;
#endif
// As the loop quits, following registers are now free: ecx, edx, esi, al, ebx, ebp, edi
// Following are NOT free: bl, eax
// edx = nDiffUsedSize
// ebp = nDiffBuffSize
// edi = pBufferDiff
// esi = i
mov ecx, esi;
call AddressSize;
#ifdef DEBUG
xor ecx, ecx;
#endif
mov edx, nDiffUsedSize;
mov ebp, nDiffBuffSize;
mov edi, pBufferDiff;
add eax, esi;
cmp eax, ebp;
jb pass2; // if (nDiffUsedSize + AddressSize(nPos) < nDiffBuffSize) jump to pass2
// Allocate more memory
#ifdef DEBUG
xor eax, eax;
#endif
push ebp;
push edi;
call realloc;
add esp, 8; // Clean stack
mov esi, eax; // New pointer
pass2:
#ifdef DEBUG
xor eax, eax;
#endif
cmp esi, 0xFF;
jbe pass3; // if (nPos > 255) jump to pass3;
cmp esi, 0xFFFF;
jbe pass4;
cmp esi, 0xFFFFFFFF;
jbe pass5;
pass3:
// i <= 255
mov cl, numof255;
inc cl;
mov eax, edi;
add eax, edx;
mov [eax], cl; // Copy numof255 to pBufferDiff + nDiffUsedSize
inc edx;
mov numof255, cl;
#ifdef DEBUG
xor eax, eax;
xor ecx, ecx;
#endif
jmp pass6;
pass4:
// i <= 65535
mov cx, numof65535;
inc cx;
mov eax, edi;
add eax, edx;
mov [eax], cx;
add edx, 2;
mov numof65535, cx;
#ifdef DEBUG
xor eax, eax;
xor ecx, ecx;
#endif
jmp pass6;
pass5:
// i > 65535
mov [edi + edx], esi;
add edx, 4;
pass6:
//pBufferDiff[nDiffUsedSize++] = b2;
mov eax, edi;
add eax, edx;
mov [eax], ah;
inc edx;
#ifdef DEBUG
xor eax, eax;
#endif
// Now synchronize data!
mov nDiffUsedSize, edx;
mov nDiffBuffSize, ebp;
mov pBufferDiff, edi;
#ifdef DEBUG
xor edx, edx;
xor ebp, ebp;
xor edi, edi;
#endif
pop edi;
pop ebp;
pop ecx;
//pop edi;
//pop ebp;
//pop ebx;
};
}
/*
 * Pass1 - compare pBufferEof against pBufferNEof byte by byte and call
 * ByteMismatchA for every position where the two bytes differ.
 *
 * The parameters i and nOneHundredth are not used by this body; the signature
 * is kept unchanged for existing callers.
 *
 * Register roles inside the loop (this is also ByteMismatchA's input contract):
 *   esi = i            (current index)
 *   ecx = remaining iterations
 *   edi = pBufferEof
 *   ebp = pBufferNEof
 *   bl  = b1 (byte from pBufferEof), bh = b2 (byte from pBufferNEof)
 *
 * Changes compared with the first version:
 *   - ebp is saved and restored; the compiler does not do this for an __asm
 *     block, and with "Omit frame pointers" the caller may keep a live value there;
 *   - "loop" replaced by dec/jnz (same iteration count; flags are not used after);
 *   - the debug guard uses DEBUG like the rest of the file instead of _DEBUG.
 *
 * NOTE(review): esi is incremented before the first load, so index 0 is never
 * compared and the last index read is nEofSize + 1. Confirm the intended range
 * before changing it - ByteMismatchA receives the same index in esi.
 */
void Pass1(uint& i, uint& nOneHundredth)
{
	__asm
	{
		push ebp;			// must survive this block

		// For debugging, zero every register that is not in use yet so it is
		// easy to see which ones are free.
#ifdef DEBUG
		xor eax, eax;
		xor ebx, ebx;
		xor ecx, ecx;
		xor edx, edx;
		xor edi, edi;
		xor esi, esi;
		xor ebp, ebp;
#endif
		mov edi, pBufferEof;
		mov ebp, pBufferNEof;
		xor esi, esi;			// i = 0
		mov ecx, nEofSize;
		inc ecx;			// run nEofSize + 1 iterations

	do_loop:
		inc esi;
		mov bl, [edi + esi];		// b1
		mov bh, [ebp + esi];		// b2
		cmp bl, bh;
		je next_byte;			// if (b1 == b2) continue;
		call ByteMismatchA;		// keeps ecx, esi, edi, ebp intact
#ifdef DEBUG
		xor ebx, ebx;
#endif
	next_byte:
		dec ecx;
		jnz do_loop;

		pop ebp;
	};
}
Compiled with "Omit frame pointers" to release ebp.