Results 1 to 3 of 3
  1. #1
    ????????????????????????? Doomulation's Avatar
    Join Date
    Nov 2001
    Location
    ????????????????
    Posts
    8,780

    Assembly Optimization

    Okkkay... so this is my first big chunk written in assembly. Only problem is - it's SLOW! It must become faster! This is incredible number crunching, so any way to speed it up is fine! Currently, my processor only supports MMX/SSE/3DNow!, but later, I shall have one that supports it all: MMX/SSE/SSE2/SSE3/3DNow!/3DNow! Professional/64-bit support/X2. So I will take any optimizations you can teach me!



    Help me? Please?

    Code:
    Code:
    void ByteMismatchA(/*uint& i, BYTE& b2, BYTE& b1*/)
    {
    	/* From Pass1:
    		esi = i
    		ecx = nEofSize
    		edi = pBufferEof
    		ebp = pBufferNEof
    		bl = b1;
    		bh = b2; */
    
    	// Bytes mismatch. First try to scan further in the file to see if the same pattern of bytes might
    	// exist further in the file.
    	__asm
    	{
    		// ecx = nNEofSize
    		// edx = lPosNEof
    		// bl (bx, ebx) = b
    		push ecx;
    #ifdef DEBUG
    		xor ecx, ecx;
    #endif
    		mov ecx, nNEofSize;
    		mov edx, esi;
    		sub ecx, esi;
    
    begin_loop:
    		inc edx;
    		mov al, [ebp + edx];
    		/* Do call to function */
    		loop begin_loop; // if (lPosNEof > nNEofSize)
    
    		// Registers used in Pass1 - so save them.
    #ifdef DEBUG
    		xor eax, eax;
    		xor ecx, ecx;
    		xor edx, edx;
    #endif
    
    		push ebp;
    		push edi;
    #ifdef DEBUG
    		xor ebp, ebp;
    		xor edi, edi;
    #endif
    
    		// As the loop quits, following registers are now free: ecx, edx, esi, al, ebx, ebp, edi
    		// Following are NOT free: bl, eax
    		
    		// edx = nDiffUsedSize
    		// ebp = nDiffBuffSize
    		// edi = pBufferDiff
    		// esi = i
    		mov ecx, esi;
    		call AddressSize;
    #ifdef DEBUG
    		xor ecx, ecx;
    #endif
    
    		mov edx, nDiffUsedSize;
    		mov ebp, nDiffBuffSize;
    		mov edi, pBufferDiff;
    
    		add eax, esi;
    		cmp eax, ebp;
    		jb pass2; // if (nDiffUsedSize + AddressSize(nPos) < nDiffBuffSize) jump to pass2
    
    		// Allocate more memory
    #ifdef DEBUG
    		xor eax, eax;
    #endif
    		push ebp;
    		push edi;
    		call realloc;
    		add esp, 8; // Clean stack
    		mov esi, eax; // New pointer
    		
    pass2:
    #ifdef DEBUG
    		xor eax, eax;
    #endif
    		cmp esi, 0xFF;
    		jbe pass3; // if (nPos > 255) jump to pass3;
    		cmp esi, 0xFFFF;
    		jbe pass4;
    		cmp esi, 0xFFFFFFFF;
    		jbe pass5;
    
    pass3:
    		// i <= 255
    		mov cl, numof255;
    		inc cl;
    		mov eax, edi;
    		add eax, edx;
    		mov [eax], cl; // Copy numof255 to pBufferDiff + nDiffUsedSize
    		inc edx;
    		mov numof255, cl;
    #ifdef DEBUG
    		xor eax, eax;
    		xor ecx, ecx;
    #endif
    		jmp pass6;
    
    pass4:
    		// i <= 65535
    		mov cx, numof65535;
    		inc cx;
    		mov eax, edi;
    		add eax, edx;
    		mov [eax], cx;
    		add edx, 2;
    		mov numof65535, cx;
    #ifdef DEBUG
    		xor eax, eax;
    		xor ecx, ecx;
    #endif
    		jmp pass6;
    
    pass5:
    		// i > 65535
    		mov [edi + edx], esi;
    		add edx, 4;
    
    pass6:
    		//pBufferDiff[nDiffUsedSize++] = b2;
    		mov eax, edi;
    		add eax, edx;
    		mov [eax], ah;
    		inc edx;
    #ifdef DEBUG
    		xor eax, eax;
    #endif
    		
    		// Now synchronize data!
    		mov nDiffUsedSize, edx;
    		mov nDiffBuffSize, ebp;
    		mov pBufferDiff, edi;
    #ifdef DEBUG
    		xor edx, edx;
    		xor ebp, ebp;
    		xor edi, edi;
    #endif
    		pop edi;
    		pop ebp;
    		pop ecx;
    		//pop edi;
    		//pop ebp;
    		//pop ebx;
    	};
    }
    
    /*
     * Pass1 - walks pBufferEof and pBufferNEof in lock-step and calls
     * ByteMismatchA for every position where the two bytes differ.
     *
     * Positions 0 .. nEofSize (inclusive) are compared, i.e. nEofSize + 1
     * iterations, matching the original "while (i <= nEofSize)" intent.
     *
     * The parameters i and nOneHundredth are not used by this body.
     *
     * Register roles inside the loop (also ByteMismatchA's calling contract):
     *     esi = i             current position
     *     ecx = iterations remaining
     *     edi = pBufferEof
     *     ebp = pBufferNEof
     *     bl  = b1 = pBufferEof[i]
     *     bh  = b2 = pBufferNEof[i]
     *
     * Compile with "Omit frame pointers" so ebp is available as a data register.
     */
    void Pass1(uint& i, uint& nOneHundredth)
    {
    	__asm
    	{
    		// The compiler saves ebx/esi/edi around an __asm block by itself, but not ebp,
    		// so save it by hand before using it as a data pointer.
    		push ebp;

    		// For debugging, we will zero out all registers not in use at the moment to make it easier to see what registers may be used.
    #ifdef DEBUG
    		xor eax, eax;
    		xor ebx, ebx;
    		xor ecx, ecx;
    		xor edx, edx;
    		xor edi, edi;
    		xor esi, esi;
    		xor ebp, ebp;
    #endif

    		mov edi, pBufferEof;
    		mov ebp, pBufferNEof;
    		xor esi, esi;                 // i = 0
    		mov ecx, nEofSize;
    		inc ecx;                      // i <= nEofSize  <=>  i < nEofSize + 1, so run nEofSize + 1 times
    		jz pass1_done;                // count wrapped to 0: skip rather than spin ~4G times

    do_loop:
    		mov bl, [edi + esi];
    		mov bh, [ebp + esi];
    		cmp bl, bh;
    		je redo_loop;                 // if (b1 == b2) continue;
    		call ByteMismatchA;           // register-based call, see the header comment
    #ifdef DEBUG
    		xor ebx, ebx;
    #endif

    redo_loop:
    		inc esi;                      // advance after the compare so that byte 0 is included
    		dec ecx;                      // dec/jnz instead of the microcoded `loop` instruction
    		jnz do_loop;

    pass1_done:
    		pop ebp;
    	};
    }
    Compiled with "Omit frame pointers" to release ebp.
    Atashi wa juu-yon-sai no onna no ko! Atashi no namae wa Miizuki. Yurushiku ne!
    Nani? Atashi o shinjirimasen desu ka? Baka!
    "You're all doomed! Doomed, I say! Hehe... are we approaching the end of the world?"

    shikata ga kaite aru - "the instructions are written above"
    Need to download GoodN64 or instructions to use it? Need to check if it's a good or bad rom?
    Download: Glide64 | Hacktarux's wrapper


    • Advertising

      advertising
      EmuTalk.net
      has no influence
      on the ads that
      are displayed
        
       

  2. #2
    Saturnin forever ! Runik's Avatar
    Join Date
    Aug 2005
    Posts
    45
    I'm not an assembly master, I'm just using it from time to time for some specific bits, but you should use a profiler to see what's making your code slow ...
    AMD's CodeAnalyst is nice for that, it's free, and it'll work with any CPU (even Intel ones )

    And don't forget that the best optimization lies most of the time in a better algorithm
    Last edited by Runik; February 27th, 2006 at 17:29.
    Beware the duck !

  3. #3
    ????????????????????????? Doomulation's Avatar
    Join Date
    Nov 2001
    Location
    ????????????????
    Posts
    8,780
    I have it and I have run it. I don't know how to use simulation (bad tutorial) and well... the timer profiler indicates the bottlenecks are 3 instructions: namely "inc edx" (ByteMismatchA + 15), "mov al, [ebp + edx]" (ByteMismatchA + 16) and lastly "loop begin_loop" (ByteMismatchA + 20).

    I have no idea on how to optimize that.
    Basically, the whole bottleneck is the loop...

    Code:
    		// Excerpt of the scan loop from ByteMismatchA (the begin_loop label itself is not shown here).
    		inc edx;                      // advance the scan position by one byte
    		mov al, [ebp + edx];          // fetch the byte at ebp + edx into al
    		/* Do call to function */
    		loop begin_loop; // if (lPosNEof > nNEofSize)
    ...which is necessary code and it will only get slower! That is because for now it only fetches the byte, but later it will also examine it!
    Atashi wa juu-yon-sai no onna no ko! Atashi no namae wa Miizuki. Yurushiku ne!
    Nani? Atashi o shinjirimasen desu ka? Baka!
    "You're all doomed! Doomed, I say! Hehe... are we approaching the end of the world?"

    shikata ga kaite aru - "the instructions are written above"
    Need to download GoodN64 or instructions to use it? Need to check if it's a good or bad rom?
    Download: Glide64 | Hacktarux's wrapper

Similar Threads

  1. x86 Code Optimization
    By blueshogun96 in forum Programming
    Replies: 17
    Last Post: January 25th, 2006, 15:36
  2. Books on assembly language for Win32 OS's
    By cufunha in forum Programming
    Replies: 10
    Last Post: January 30th, 2005, 03:49
  3. Assembly!
    By Doomulation in forum Programming
    Replies: 10
    Last Post: June 21st, 2004, 19:50
  4. Assembly Language
    By JedUK in forum Programming
    Replies: 13
    Last Post: June 24th, 2003, 15:17
  5. Optimizing in the dynamic recompilation!
    By Norlin in forum Mupen64
    Replies: 2
    Last Post: February 9th, 2003, 10:15

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •