What's new

Bra64 news

OP
A

ashade

New member
ok, so lets play this game...
take a look at my matrix multiplication function and glide 64 multiplication function... compare the speed of them and tell me what u think...

To compare the speed, put both in a huge loop (repeating each function about 1000000 times) and compare the time used to each function...

my function:

/*
 * 4x4 matrix of floats.
 *
 * The anonymous union overlays two views of the same storage:
 *   - M[r][c]   : indexed access, r and c in 0..3
 *   - a11..a44  : one named float per element, declared in row order
 */
typedef struct {
    union {
        float M[4][4];
        struct { /* direct access to the individual elements */
            float a11, a12, a13, a14;
            float a21, a22, a23, a24;
            float a31, a32, a33, a34;
            float a41, a42, a43, a44;
        };
    };
} MATRIS;

/*
 * MATRIS_dot - fully unrolled 4x4 matrix product written in x87 inline
 * assembly (`__asm { }` block syntax, 32-bit x86 registers).
 *
 * Computes  *mDest = (*mSrc2) x (*mSrc1) :
 *     mDest.aRC = mSrc2.aR1 * mSrc1.a1C + mSrc2.aR2 * mSrc1.a2C
 *               + mSrc2.aR3 * mSrc1.a3C + mSrc2.aR4 * mSrc1.a4C
 *
 * Parameters:
 *   mDest - receives the 16 result elements, stored row by row.
 *   mSrc1 - right-hand operand; read by column for every output element.
 *   mSrc2 - left-hand operand; each row is loaded once and kept on the
 *           FPU register stack while that output row is computed.
 *
 * Notes:
 *   - mDest must not be the same object as mSrc1: rows of mDest are
 *     stored while mSrc1 is still being read for the following rows.
 *   - Up to six x87 stack slots are in use at once (four cached row
 *     elements plus two temporaries); the stack is back at its entry
 *     depth when the block ends.
 *   - eax, ebx and ecx are pushed on entry and popped on exit.
 */
void MATRIS_dot(MATRIS* mDest, MATRIS* mSrc1, MATRIS*
mSrc2) {

/*
Expanded product computed below, with a = elements of *mSrc2 and
b = elements of *mSrc1 (one brace group per result row):

{{a11 b11 + a12 b21 + a13 b31 + a14 b41,
a11 b12 + a12 b22 + a13 b32 + a14 b42,
a11 b13 + a12 b23 + a13 b33 + a14 b43,
a11 b14 + a12 b24 + a13 b34 + a14 b44}, {a21 b11 + a22 b21 + a23 b31 +
a24 b41, a21 b12 + a22 b22 + a23 b32 + a24 b42,
a21 b13 + a22 b23 + a23 b33 + a24 b43,
a21 b14 + a22 b24 + a23 b34 + a24 b44}, {a31 b11 + a32 b21 + a33 b31 +
a34 b41, a31 b12 + a32 b22 + a33 b32 + a34 b42,
a31 b13 + a32 b23 + a33 b33 + a34 b43,
a31 b14 + a32 b24 + a33 b34 + a34 b44}, {a41 b11 + a42 b21 + a43 b31 +
a44 b41, a41 b12 + a42 b22 + a43 b32 + a44 b42,
a41 b13 + a42 b23 + a43 b33 + a44 b43,
a41 b14 + a42 b24 + a43 b34 + a44 b44}}
*/

__asm {
// Save the three registers used as base pointers.
push eax
push ebx
push ecx


// eax -> left operand (mSrc2), ebx -> right operand (mSrc1), ecx -> result.
mov eax, mSrc2
mov ebx, mSrc1
mov ecx, mDest

// ---- Result row 1 ----
// Cache row 1 of mSrc2: afterwards st(3)=a11, st(2)=a12, st(1)=a13, st(0)=a14.
fld [eax].a11
fld [eax].a12
fld [eax].a13
fld [eax].a14

// mDest.a11 = cached row . column 1 of mSrc1
fld st(3)
fmul [ebx].a11

// The running sum now sits in st(0), so the cached row is one slot deeper:
// st(3) is the 2nd element, then st(2) the 3rd and st(1) the 4th once each
// partial product has been folded into the sum by faddp.
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)

fld st(2)
fmul [ebx].a31
faddp st(1), st(0)

fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a11


// mDest.a12 = cached row . column 2 of mSrc1
fld st(3)
fmul [ebx].a12

fld st(3)
fmul [ebx].a22
faddp st(1), st(0)

fld st(2)
fmul [ebx].a32
faddp st(1), st(0)

fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a12


// mDest.a13 = cached row . column 3 of mSrc1
fld st(3)
fmul [ebx].a13

fld st(3)
fmul [ebx].a23
faddp st(1), st(0)

fld st(2)
fmul [ebx].a33
faddp st(1), st(0)

fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a13


// mDest.a14 = cached row . column 4 of mSrc1
fld st(3)
fmul [ebx].a14

fld st(3)
fmul [ebx].a24
faddp st(1), st(0)

fld st(2)
fmul [ebx].a34
faddp st(1), st(0)

fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a14

// Discard the four cached elements of row 1.
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)

// ---- Result row 2 ----
// Cache row 2 of mSrc2 on the FPU stack.
fld [eax].a21
fld [eax].a22
fld [eax].a23
fld [eax].a24

// mDest.a21 = cached row . column 1 of mSrc1
fld st(3)
fmul [ebx].a11

fld st(3)
fmul [ebx].a21
faddp st(1), st(0)

fld st(2)
fmul [ebx].a31
faddp st(1), st(0)

fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a21


// mDest.a22 = cached row . column 2 of mSrc1
fld st(3)
fmul [ebx].a12

fld st(3)
fmul [ebx].a22
faddp st(1), st(0)

fld st(2)
fmul [ebx].a32
faddp st(1), st(0)

fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a22


// mDest.a23 = cached row . column 3 of mSrc1
fld st(3)
fmul [ebx].a13

fld st(3)
fmul [ebx].a23
faddp st(1), st(0)

fld st(2)
fmul [ebx].a33
faddp st(1), st(0)

fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a23


// mDest.a24 = cached row . column 4 of mSrc1
fld st(3)
fmul [ebx].a14

fld st(3)
fmul [ebx].a24
faddp st(1), st(0)

fld st(2)
fmul [ebx].a34
faddp st(1), st(0)

fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a24

// Discard the four cached elements of row 2.
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)

// ---- Result row 3 ----
// Cache row 3 of mSrc2 on the FPU stack.
fld [eax].a31
fld [eax].a32
fld [eax].a33
fld [eax].a34

// mDest.a31 = cached row . column 1 of mSrc1
fld st(3)
fmul [ebx].a11

fld st(3)
fmul [ebx].a21
faddp st(1), st(0)

fld st(2)
fmul [ebx].a31
faddp st(1), st(0)

fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a31


// mDest.a32 = cached row . column 2 of mSrc1
fld st(3)
fmul [ebx].a12

fld st(3)
fmul [ebx].a22
faddp st(1), st(0)

fld st(2)
fmul [ebx].a32
faddp st(1), st(0)

fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a32


// mDest.a33 = cached row . column 3 of mSrc1
fld st(3)
fmul [ebx].a13

fld st(3)
fmul [ebx].a23
faddp st(1), st(0)

fld st(2)
fmul [ebx].a33
faddp st(1), st(0)

fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a33


// mDest.a34 = cached row . column 4 of mSrc1
fld st(3)
fmul [ebx].a14

fld st(3)
fmul [ebx].a24
faddp st(1), st(0)

fld st(2)
fmul [ebx].a34
faddp st(1), st(0)

fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a34

// Discard the four cached elements of row 3.
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)

// ---- Result row 4 ----
// Cache row 4 of mSrc2 on the FPU stack.
fld [eax].a41
fld [eax].a42
fld [eax].a43
fld [eax].a44

// mDest.a41 = cached row . column 1 of mSrc1
fld st(3)
fmul [ebx].a11

fld st(3)
fmul [ebx].a21
faddp st(1), st(0)

fld st(2)
fmul [ebx].a31
faddp st(1), st(0)

fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a41


// mDest.a42 = cached row . column 2 of mSrc1
fld st(3)
fmul [ebx].a12

fld st(3)
fmul [ebx].a22
faddp st(1), st(0)

fld st(2)
fmul [ebx].a32
faddp st(1), st(0)

fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a42


// mDest.a43 = cached row . column 3 of mSrc1
fld st(3)
fmul [ebx].a13

fld st(3)
fmul [ebx].a23
faddp st(1), st(0)

fld st(2)
fmul [ebx].a33
faddp st(1), st(0)

fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a43


// mDest.a44 = cached row . column 4 of mSrc1
fld st(3)
fmul [ebx].a14

fld st(3)
fmul [ebx].a24
faddp st(1), st(0)

fld st(2)
fmul [ebx].a34
faddp st(1), st(0)

fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a44

// Discard the four cached elements of row 4; the FPU stack is now back at
// its entry depth.
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)


// Restore the saved registers in reverse push order.
pop ecx
pop ebx
pop eax
}
}


Glide 64 function:

/*
 * projection_mul - 4x4 matrix product, proj = m x m_src.
 *
 *   proj[j][i] = m[j][0]*m_src[0][i] + m[j][1]*m_src[1][i]
 *              + m[j][2]*m_src[2][i] + m[j][3]*m_src[3][i]
 *
 * Parameters:
 *   proj  - output matrix; every element is overwritten.
 *   m_src - right-hand operand.
 *   m     - left-hand operand.
 *
 * proj must not alias m_src or m: results are written while both inputs
 * are still being read.
 *
 * The [i] subscripts had been lost from the posted code (the forum treated
 * them as italic markup) and a stray local `float m_src[4][4];` redeclared
 * the parameter; both are repaired here.
 */
void projection_mul (float proj[4][4], float m_src[4][4], float m[4][4])
{
    for (int i=0; i<4; i++)          /* i: column of the result */
    {
        for (int j=0; j<4; j++)      /* j: row of the result */
        {
            proj[j][i] =
                m_src[0][i] * m[j][0] +
                m_src[1][i] * m[j][1] +
                m_src[2][i] * m[j][2] +
                m_src[3][i] * m[j][3];
        }
    }
}
 

Remote

Active member
Moderator
Perhaps someone knowing could give his view on this...

EDIT: Nevermind I just read Hacktarux's reply...
 
Last edited:

Hacktarux

Emulator Developer
Moderator
i've tried it just to be sure and the results were what i thought: i get nearly exactly the same results with your function and glide64's one. After 10^7 multiplications of random matrices (it takes 30 seconds on my computer) the difference is less than 20ms.

You may be wondering why...
my guess is that your code takes way too much space. Cpus are generally not designed to cache such long unrolled code; they are optimized for small loops. Nevertheless, it still fits in the cache after some iterations. Remember that this test is not done in real conditions. In a real plugin there are many things happening between two matrix multiplications, so the code has to be reloaded each time. This means that your function is probably slower in a real plugin than glide64's one... Again it has nothing to do with asm vs c, it's the algorithm.... You can also unroll loops in c and you'll still have the same issue.
 

Knuckles

Active member
Moderator
Did you only make Mario64 run or also some other games? And is only the title screen working or can you get ingame with wireframe display?
 
OP
A

ashade

New member
yeah, unroll loops i can, but look at this example:

to divide a number in c++ by a constant, you have to do it:

//consider x is a 16bit number
x /= 10; //using 10 for example

this is slow, because the divide operation takes too many cycles... my idea is to use the multiply operation to divide by a constant (and the mul instruction is marginally faster!).

look at this:

// Unsigned 16-bit x / 10 without a div instruction.
// 52429 = ceil(2^19 / 10); `mul dx` leaves the 32-bit product in dx:ax, so
// dx is (x * 52429) >> 16 and the extra `shr dx, 3` completes the >> 19.
// This is exact for every x in 0..65535. The previous constant 6554 with a
// plain >> 16 first goes wrong at x = 16389 (yields 1639 instead of 1638).
// Assumes x is an unsigned 16-bit variable.
__asm {
mov ax, x
mov dx, 52429
mul dx
shr dx, 3
mov x, dx
}


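it is a little hard to explain to you why this works (in short: the constant is a scaled reciprocal of 10, so the high part of the product that mul leaves in dx holds the quotient), but this kind of multiplication can't be done in c++... test it yourself if u want... make a loop repeating each of the two versions about 10^10 times and see the results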
 

tooie

New member
Asm can be faster than C++ .. but what you have here is an algorithm method .. granted some algorithms can be done in asm and not in c .. that is where inline asm comes in .. writing highly intensive functions in asm can be great .. I know for your matrix stuff using the different P3 matrix ops can be dramatically faster ..

but you run into maintainability and being able to change code as well .. no matter how fast you write an interpreter .. a dynamic recompiler will always be faster .. this has to do with the algorithm more than the language ..

ashade said:
yeah, unroll loops i can, but look at this example:

to divide a number in c++ by a constant, you have to do it:

//consider x is a 16bit number
x /= 10; //using 10 for example

this is slow, because the divide operation takes too many cycles... my idea is to use the multiply operation to divide by a constant (and the mul instruction is marginally faster!).

look at this:

// Unsigned 16-bit x / 10 without a div instruction.
// 52429 = ceil(2^19 / 10); `mul dx` leaves the 32-bit product in dx:ax, so
// dx is (x * 52429) >> 16 and the extra `shr dx, 3` completes the >> 19.
// This is exact for every x in 0..65535. The previous constant 6554 with a
// plain >> 16 first goes wrong at x = 16389 (yields 1639 instead of 1638).
// Assumes x is an unsigned 16-bit variable.
__asm {
mov ax, x
mov dx, 52429
mul dx
shr dx, 3
mov x, dx
}


it is a little hard to explain to you why this works, but this kind of multiplication can't be done in c++... test it yourself if u want... make a loop repeating each of the two versions about 10^10 times and see the results
 

sketzh

New member
ashade said:
//consider x is a 16bit number
x /= 10; //using 10 for example

Well to divide faster in C++ you do the same trick.
You just multiply like this instead:

x *= 0.10f;
 

Doomulation

?????????????????????????
Although, asm is a little faster in general. Do a small strcpy and you'll see how it jumps around and takes time to complete...
Function jumping is slow, especially this happens at the first call, dunno if it calls anymore...

With asm, however, the processor amends the instructions immediately (afaik?) and is thus a little faster. But it might not matter in the long run as many have said......
And the fact that the compiler cannot optimize asm :(

Anyway, good luck on the plugin ashade and post a shot with real gfx and the speed limiter off (press F4 in pj) when you've got it working! :happy:
 

ScottJC

At your service, dood!
Why the hell are you people arguing, Ashade can program his gfx plugin in whatever compiler he wants to,

And it is a fact that assembler is faster than c++, not by much on these modern computers but it definitely is. it is also a fact that c++ compiles INTO assembler (machine code), all compilers do, a typical compiler compiles code into assembler which isn't exactly brilliantly optimized.

in assembler you have complete control over your software, and how much optimization it can have, as in c++, you do not. because it will always compile your C++ in the exact same way, optimized for what the compiler thinks is optimized;

a function in assembler can be just as slow as a c++, but i'm willing to bet such a function would have a lot more instructions than the code produced in c++, and it did them all in the same amount of time. in the end c++ code produces lengthy assembler code as a result.

Btw ashade, good work, i look forward to the future of this plugin, :D
 
Last edited:

radTube

lazy bastard
Why do you guys sound like you want ashade to quit or code just like everybody else has done? I know these things tend to turn into some sort of competitions, usually made into such by people who have nothing to do with the coding, but couldn't we just give our support to this project and see what comes of it?

Good luck ashade, I hope bra64 develops into another great plugin. :flowers:
 

Doomulation

?????????????????????????
'Tis not good!
Ashade, we're not making you develop any diffrent if you're thinking that! You do it as you want, as long as the plugin gets good :D

Good luck! :flowers:
 

Trotterwatch

New member
I don't think anyone here has been nasty to Ashade. He has posted stating he has written code in ASM that is superior to anything that could be written in C++. As a result some experienced C Coders have stated that what he has said isn't totally correct, and explained why. Ashade should take this as a challenge rather than an insult.
 

Hacktarux

Emulator Developer
Moderator
The problem is to define in which aspect asm is superior to c. It's superior in speed when you deeply optimize a single block of lines, but on the other hand it's harder to maintain...

What i was trying to say is that asm speed optimization isn't that important when you begin coz algorithms have much more impact. It's like tuning your car, adding an aileron and such and still having a bad engine. It'll probably be faster, but still slow, and i'd prefer starting by improving the engine...

I personally believe that it's very hard to do it all in asm the first time you do it. I think it would be faster to do it in c, optimize the algorithms and, once everything is working for sure, it can still be optimized in asm... Starting with the most time-consuming functions and converting all parts to asm step by step... I believe that this approach takes less dev time, coz if u write everything in asm, it'll be very hard to read, correct, modify.... when the plugin starts to get huge, and will finally take more time than doing it twice (once in c and once in asm). Now, it's only my opinion, you don't need to agree :D
 

tooie

New member
Sayargh said:
Why the hell are you people arguing, Ashade can program his gfx plugin in whatever compiler he wants to,

And it is a fact that assembler is faster than c++, not by much on these modern computers but it definitely is. it is also a fact that c++ compiles INTO assembler (machine code), all compilers do, a typical compiler compiles code into assembler which isn't exactly brilliantly optimized.

in assembler you have complete control over your software, and how much optimization it can have, as in c++, you do not. because it will always compile your C++ in the exact same way, optimized for what the compiler thinks is optimized;

a function in assembler can be just as slow as a c++, but i'm willing to bet such a function would have a lot more instructions than the code produced in c++, and it did them all in the same amount of time. in the end c++ code produces lengthy assembler code as a result.

Btw ashade, good work, i look forward to the future of this plugin, :D

the discussion is more about:
Doomulation said:
Well, it is developed in asm, somewhat at least, so yes it is FAST!!!
Very nice work, ashade! :inlove:

we are just saying this can be, but is not necessarily, true .. there is a lot more to programming than just the language .. maintainability is a major thing, as well as readability.
 

pj64er

PJ64 Lubba
i dont see any nastiness either. i just see three emu programmers telling ashade what they learned through cold, hard experience.

even in my limited programming knowledge, i know that assembly (low level) can be more optimised than c++ (high level). but you guys cannot think like:

-low level is better than high level!
-asm is better than c++!
-plugin in asm is better than plugin in c++!
-w00t! revolution!
-quick! defend guy who write in asm at all costs!


ashade, hacktarux, tooie and icepir8 all know their stuff. let them have their little debate. I have a feeling the rest of you (like me) dont really know whats going on:flowers:
 

tooie

New member
pj64er said:

-low level is better than high level!

I never really think of C++ as high level .. mostly cause I do work at times with Visual Basic, Web stuff, SQL .. those I would think more as high level. But I guess it is what your comparing.
 

pj64er

PJ64 Lubba
tooie said:
I never really think of C++ as high level .. mostly cause I do work at times with Visual Basic, Web stuff, SQL .. those I would think more as high level. But I guess it is what your comparing.

may I emphasize the limited knowledge part...:doh:

:happy:
 

sketzh

New member
..

Many gfx coders these days dont ever bother about coding in asm anymore.. They know it takes to much effort to make any faster that the compiler can do..

The optimizers today aint that bad.. And with all the GPUs at the market there is really no point in optimizing it yourself..

And for speaking of CPUs, I dont even know anybody with a CPU slower than 1.5 ghz these days..

My opinion is that its just a lot of waste of time..
 

Top