Ok, so what's been happening recently:
* Fixed the Mario cannon bug in the snow level that used to freeze the game (it actually had to do with pushing/popping the matrices! It took me 2 days to find the source of this bug)
* Added support for a texture cache that works with any size of texture memory
* Fixed the 2MB texture memory boundary bug
The next thing I need to do is fix my matrices. I think they are why SF Rush and Tetrisphere don't work well. I think I'm reversing something, and I absolutely cannot figure out what, considering I'm not very good with matrices in the first place. I'll post my code here; please help me!

(Scroll down for the main uc0:matrix function; see my post on a previous page for how I'm using the matrices to draw):
in rdp:
// Matrices
// Matrix state; the functions below access these as rdp.model, rdp.proj, etc.
// Current modelview matrix (written by modelview_load / modelview_mul / modelview_pop).
float model[4][4];
// Current projection matrix (written by projection_load / projection_mul).
float proj[4][4];
// Saved modelview matrices. Storage for 10 entries; modelview_push reports
// an error and returns instead of writing past the end.
float model_stack[10][4][4]; // 10 deep, will warn if overflow
// Next free slot in model_stack: push stores at this index and then
// increments it, pop decrements it and then reads from it.
int model_i; // index in the model matrix stack
// Replace the current modelview matrix with a copy of m.
void modelview_load (float m[4][4])
{
    // One whole 4x4 float matrix (64 bytes).
    memcpy (rdp.model, m, sizeof rdp.model);
}
// Multiply the current modelview matrix by m, in place:
//     rdp.model = rdp.model * m      (old matrix on the left)
//
// NOTE(review): the notes accompanying this code suspect that something is
// reversed in the matrix handling. If that turns out to be here, the fix is
// to swap the operands (m * m_src) -- confirm against the microcode docs.
void modelview_mul (float m[4][4])
{
    float m_src[4][4];

    // Snapshot the old matrix first: rdp.model is overwritten element by
    // element below while its old values are still needed.
    memcpy (m_src, rdp.model, sizeof m_src);

    for (int i = 0; i < 4; i++)         // row in result
    {
        for (int j = 0; j < 4; j++)     // column in result
        {
            rdp.model[i][j] =
                m_src[i][0] * m[0][j] +
                m_src[i][1] * m[1][j] +
                m_src[i][2] * m[2][j] +
                m_src[i][3] * m[3][j];
        }
    }
}
// Save a copy of the current modelview matrix on the model matrix stack.
// If the stack is already full, an error is reported and nothing is saved.
void modelview_push ()
{
    // Capacity comes from the array itself so every declared slot is usable
    // and the check cannot drift from the declaration.
    const int depth = (int)(sizeof rdp.model_stack / sizeof rdp.model_stack[0]);

    if (rdp.model_i >= depth)
    {
        // Keep the number in this message in sync with the declared depth.
        RDP_E ("** Model matrix stack overflow ** > 10 push");
        return;
    }

    memcpy (rdp.model_stack[rdp.model_i], rdp.model, sizeof rdp.model);
    rdp.model_i ++;
}
// Restore the modelview matrix from the most recently pushed stack entry.
// With nothing on the stack, an error is reported and the current matrix
// is left untouched.
void modelview_pop ()
{
    if (rdp.model_i != 0)
    {
        // Step back to the last saved slot, then copy it into place.
        --rdp.model_i;
        memcpy (rdp.model, rdp.model_stack[rdp.model_i], sizeof rdp.model);
    }
    else
    {
        RDP_E ("** Model matrix stack failed** too many pops");
    }
}
// Push the current modelview matrix, then replace it with m.
// The push happens first so the matrix being replaced is the one saved.
void modelview_load_push (float m[4][4])
{
    modelview_push ();      // save the outgoing matrix
    modelview_load (m);     // install the new one
}
// Push the current modelview matrix, then multiply it by m.
// The push happens first so the pre-multiplication matrix is the one saved.
void modelview_mul_push (float m[4][4])
{
    modelview_push ();      // save the matrix as it was
    modelview_mul (m);      // then combine it with m
}
// Replace the current projection matrix with a copy of m.
void projection_load (float m[4][4])
{
    // One whole 4x4 float matrix (64 bytes).
    memcpy (rdp.proj, m, sizeof rdp.proj);
}
// Multiply the current projection matrix by m, in place:
//     rdp.proj = rdp.proj * m      (old matrix on the left)
//
// NOTE(review): the notes accompanying this code suspect that something is
// reversed in the matrix handling. If that turns out to be here, the fix is
// to swap the operands (m * m_src) -- confirm against the microcode docs.
void projection_mul (float m[4][4])
{
    float m_src[4][4];

    // Snapshot the old matrix first: rdp.proj is overwritten element by
    // element below while its old values are still needed.
    memcpy (m_src, rdp.proj, sizeof m_src);

    for (int i = 0; i < 4; i++)         // row in result
    {
        for (int j = 0; j < 4; j++)     // column in result
        {
            rdp.proj[i][j] =
                m_src[i][0] * m[0][j] +
                m_src[i][1] * m[1][j] +
                m_src[i][2] * m[2][j] +
                m_src[i][3] * m[3][j];
        }
    }
}
// Handler for the uc0 "matrix" command.
//
// Reads a 4x4 matrix from RDRAM at the segment-relative address in rdp.cmd1,
// converts it from 16.16 fixed point to float, and applies it according to
// the command byte taken from bits 16..23 of rdp.cmd0:
//     bit 0 set -> projection matrix, clear -> modelview matrix
//     bit 1 set -> load (replace),    clear -> multiply
//     bit 2 set -> push first (modelview only; projection is never pushed)
static void rsp_uc00_matrix()
{
    RDP("uc0:matrix ");

    // Use segment offset to get the address (only the low 22 bits are kept)
    DWORD addr = segoffset(rdp.cmd1) & 0x003FFFFF;
    BYTE command = (BYTE)((rdp.cmd0 >> 16) & 0xFF);
    float m[4][4];
    int x, y; // matrix index

    // From here on addr indexes 16-bit words of RDRAM rather than bytes.
    addr >>= 1;

    // The 16 upper halves come first, the 16 lower halves follow at +16.
    // The ^1 on the index presumably compensates for the word-swapped
    // layout of RDRAM -- TODO confirm.
    for (x = 0; x < 16; x += 4) { // Adding 4 instead of one, just to remove mult. later
        for (y = 0; y < 4; y++) {
            DWORD hi = ((WORD*)gfx.RDRAM)[(addr+x+y)^1];
            DWORD lo = ((WORD*)gfx.RDRAM)[(addr+x+y+16)^1];

            // Assemble the 32-bit value unsigned and only then reinterpret
            // it as signed: shifting a signed value into the sign bit is
            // undefined behaviour in C.
            m[x>>2][y] = (float)(__int32)((hi << 16) | lo) / 65536.0f;
        }
    }

    switch (command)
    {
    case 0: // modelview mul nopush
        RDP ("modelview mul\n");
        modelview_mul (m);
        break;
    case 1: // projection mul nopush
    case 5: // projection mul push, can't push projection
        RDP ("projection mul\n");
        projection_mul (m);
        break;
    case 2: // modelview load nopush
        RDP ("modelview load\n");
        modelview_load (m);
        break;
    case 3: // projection load nopush
    case 7: // projection load push, can't push projection
        RDP ("projection load\n");
        projection_load (m);
        break;
    case 4: // modelview mul push
        RDP ("modelview mul push\n");
        modelview_mul_push (m);
        break;
    case 6: // modelview load push
        RDP ("modelview load push\n");
        modelview_load_push (m);
        break;
    default:
        // FRDP_E is presumably printf-style; the cast makes the argument
        // match the %lx conversion in the format string.
        FRDP_E ("Unknown matrix command, %02lx", (unsigned long)command);
        break;
    }
}