- Thread Starter
- #21
ok, so lets play this game...
take a look at my matrix multiplication function and glide 64 multiplication function... compare the speed of them and tell me what u think...
To compare the speed, put both in a huge loop (repeating each function about 1000000 times) and compare the time used to each function...
my function:
typedef struct {
union {
float M[4][4];
struct { //acesso direto dos elementos
float a11, a12, a13, a14,
a21, a22, a23, a24,
a31, a32, a33, a34,
a41, a42, a43, a44;
};
};
} MATRIS;
void MATRIS_dot(MATRIS* mDest, MATRIS* mSrc1, MATRIS*
mSrc2) {
/*
{{a11 b11 + a12 b21 + a13 b31 + a14 b41,
a11 b12 + a12 b22 + a13 b32 + a14 b42,
a11 b13 + a12 b23 + a13 b33 + a14 b43,
a11 b14 + a12 b24 + a13 b34 + a14 b44}, {a21 b11 + a22 b21 + a23 b31 +
a24 b41, a21 b12 + a22 b22 + a23 b32 + a24 b42,
a21 b13 + a22 b23 + a23 b33 + a24 b43,
a21 b14 + a22 b24 + a23 b34 + a24 b44}, {a31 b11 + a32 b21 + a33 b31 +
a34 b41, a31 b12 + a32 b22 + a33 b32 + a34 b42,
a31 b13 + a32 b23 + a33 b33 + a34 b43,
a31 b14 + a32 b24 + a33 b34 + a34 b44}, {a41 b11 + a42 b21 + a43 b31 +
a44 b41, a41 b12 + a42 b22 + a43 b32 + a44 b42,
a41 b13 + a42 b23 + a43 b33 + a44 b43,
a41 b14 + a42 b24 + a43 b34 + a44 b44}}
*/
__asm {
push eax
push ebx
push ecx
mov eax, mSrc2
mov ebx, mSrc1
mov ecx, mDest
fld [eax].a11
fld [eax].a12
fld [eax].a13
fld [eax].a14
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a11
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a12
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a13
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a14
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
fld [eax].a21
fld [eax].a22
fld [eax].a23
fld [eax].a24
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a21
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a22
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a23
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a24
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
fld [eax].a31
fld [eax].a32
fld [eax].a33
fld [eax].a34
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a31
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a32
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a33
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a34
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
fld [eax].a41
fld [eax].a42
fld [eax].a43
fld [eax].a44
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a41
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a42
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a43
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a44
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
pop ecx
pop ebx
pop eax
}
}
Glide 64 function:
void projection_mul (float proj[4][4], float m_src[4][4], float m[4][4])
{
float m_src[4][4];
for (int i=0; i<4; i++)
{
for (int j=0; j<4; j++)
{
proj[j] =
m_src[0] * m[j][0] +
m_src[1] * m[j][1] +
m_src[2] * m[j][2] +
m_src[3] * m[j][3];
}
}
}
take a look at my matrix multiplication function and glide 64 multiplication function... compare the speed of them and tell me what u think...
To compare the speed, put both in a huge loop (repeating each function about 1000000 times) and compare the time used to each function...
my function:
typedef struct {
union {
float M[4][4];
struct { //acesso direto dos elementos
float a11, a12, a13, a14,
a21, a22, a23, a24,
a31, a32, a33, a34,
a41, a42, a43, a44;
};
};
} MATRIS;
void MATRIS_dot(MATRIS* mDest, MATRIS* mSrc1, MATRIS*
mSrc2) {
/*
{{a11 b11 + a12 b21 + a13 b31 + a14 b41,
a11 b12 + a12 b22 + a13 b32 + a14 b42,
a11 b13 + a12 b23 + a13 b33 + a14 b43,
a11 b14 + a12 b24 + a13 b34 + a14 b44}, {a21 b11 + a22 b21 + a23 b31 +
a24 b41, a21 b12 + a22 b22 + a23 b32 + a24 b42,
a21 b13 + a22 b23 + a23 b33 + a24 b43,
a21 b14 + a22 b24 + a23 b34 + a24 b44}, {a31 b11 + a32 b21 + a33 b31 +
a34 b41, a31 b12 + a32 b22 + a33 b32 + a34 b42,
a31 b13 + a32 b23 + a33 b33 + a34 b43,
a31 b14 + a32 b24 + a33 b34 + a34 b44}, {a41 b11 + a42 b21 + a43 b31 +
a44 b41, a41 b12 + a42 b22 + a43 b32 + a44 b42,
a41 b13 + a42 b23 + a43 b33 + a44 b43,
a41 b14 + a42 b24 + a43 b34 + a44 b44}}
*/
__asm {
push eax
push ebx
push ecx
mov eax, mSrc2
mov ebx, mSrc1
mov ecx, mDest
fld [eax].a11
fld [eax].a12
fld [eax].a13
fld [eax].a14
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a11
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a12
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a13
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a14
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
fld [eax].a21
fld [eax].a22
fld [eax].a23
fld [eax].a24
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a21
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a22
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a23
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a24
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
fld [eax].a31
fld [eax].a32
fld [eax].a33
fld [eax].a34
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a31
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a32
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a33
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a34
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
fld [eax].a41
fld [eax].a42
fld [eax].a43
fld [eax].a44
fld st(3)
fmul [ebx].a11
fld st(3)
fmul [ebx].a21
faddp st(1), st(0)
fld st(2)
fmul [ebx].a31
faddp st(1), st(0)
fld st(1)
fmul [ebx].a41
faddp st(1), st(0)
fstp [ecx].a41
fld st(3)
fmul [ebx].a12
fld st(3)
fmul [ebx].a22
faddp st(1), st(0)
fld st(2)
fmul [ebx].a32
faddp st(1), st(0)
fld st(1)
fmul [ebx].a42
faddp st(1), st(0)
fstp [ecx].a42
fld st(3)
fmul [ebx].a13
fld st(3)
fmul [ebx].a23
faddp st(1), st(0)
fld st(2)
fmul [ebx].a33
faddp st(1), st(0)
fld st(1)
fmul [ebx].a43
faddp st(1), st(0)
fstp [ecx].a43
fld st(3)
fmul [ebx].a14
fld st(3)
fmul [ebx].a24
faddp st(1), st(0)
fld st(2)
fmul [ebx].a34
faddp st(1), st(0)
fld st(1)
fmul [ebx].a44
faddp st(1), st(0)
fstp [ecx].a44
fstp st(0)
fstp st(0)
fstp st(0)
fstp st(0)
pop ecx
pop ebx
pop eax
}
}
Glide 64 function:
void projection_mul (float proj[4][4], float m_src[4][4], float m[4][4])
{
float m_src[4][4];
for (int i=0; i<4; i++)
{
for (int j=0; j<4; j++)
{
proj[j] =
m_src[0] * m[j][0] +
m_src[1] * m[j][1] +
m_src[2] * m[j][2] +
m_src[3] * m[j][3];
}
}
}