; Here is a matrix multiply routine that multiplies a 4x4 matrix by a 4x1
; vector, where both the matrix and the vector elements are single-precision
; floating-point numbers. This routine is probably close to optimal on a
; Pentium. This code was written by an ex-colleague of mine; I merely added
; the comments and measured the execution time.
;
; 4x4 by 4x1 matrix multiply
; executes in 58 cycles
;
; input:
; esi = pointer to original 4x1 vector
; ebx = pointer to 4x4 transformation matrix
;
; output:
; edi = pointer to transformed 4x1 vector
;
; destroys:
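; fp stack (all eight x87 registers are used, so the fp stack must be
; empty on entry; the stack depth on exit equals the depth on entry)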
;
MACRO MATVECMUL
; Multiplies the 4-element single-precision vector at esi by the 16-element
; single-precision matrix at ebx and stores the 4-element result at edi.
;
; Notation used in the stack comments below:
;   x, y, z, w = dwords at [esi], [esi+4], [esi+8], [esi+12]
;   aNM        = dword at [ebx + 4*(N-1) + 16*(M-1)]
; Each stack comment lists the x87 stack after the instruction has executed,
; with st(0) first.
;
; Result written (element i, 0-based, goes to [edi + 4*i]):
;   [edi+4*i] = x*[ebx+16*i] + y*[ebx+16*i+4] + z*[ebx+16*i+8] + w*[ebx+16*i+12]
;
; Stack usage: the stack grows to eight entries (after the third "fld st(2)"
; of the y, z and w groups), so all eight x87 registers must be free on entry.
; Pushes and pops balance, so the stack depth is unchanged on exit.
;
; Every read from esi and ebx happens before the first store to edi, so the
; output may overlap the input vector.
;
; NOTE(review): the instruction order looks hand-scheduled (fxch/fld
; interleaved with fmul/faddp), presumably for Pentium FPU pipelining; do not
; reorder without re-measuring.
;
; ---- x terms: form the four products x*a11, x*a12, x*a13, x*a14 ----
fld [dword ptr esi] ; x
fld st(0) ; x, x
fmul [dword ptr ebx] ; x*a11, x
fld st(1) ; x, x*a11, x
fmul [dword ptr ebx+16] ; x*a12, x*a11, x
fld st(2) ; x, x*a12, x*a11, x
fmul [dword ptr ebx+32] ; x*a13, x*a12, x*a11, x
fxch st(3) ; x, x*a12, x*a11, x*a13
fmul [dword ptr ebx+48] ; x*a14, x*a12, x*a11, x*a13
; ---- y terms: form y*a21..y*a24 and add each into the matching x product ----
fld [dword ptr esi+4] ; y, x*a14, x*a12, x*a11, x*a13
fld st(0) ; y, y, x*a14, x*a12, x*a11, x*a13
fmul [dword ptr ebx+4] ; y*a21, y, x*a14, x*a12, x*a11, x*a13
fld st(1) ; y, y*a21, y, x*a14, x*a12, x*a11, x*a13
fmul [dword ptr ebx+20] ; y*a22, y*a21, y, x*a14, x*a12, x*a11, x*a13
fld st(2) ; y, y*a22, y*a21, y, x*a14, x*a12, x*a11, x*a13
fmul [dword ptr ebx+36] ; y*a23, y*a22, y*a21, y, x*a14, x*a12, x*a11, x*a13
fxch st(3) ; y, y*a22, y*a21, y*a23, x*a14, x*a12, x*a11, x*a13
fmul [dword ptr ebx+52] ; y*a24, y*a22, y*a21, y*a23, x*a14, x*a12, x*a11, x*a13
fxch st(2) ; y*a21, y*a22, y*a24, y*a23, x*a14, x*a12, x*a11, x*a13
faddp st(6),st ; y*a22, y*a24, y*a23, x*a14, x*a12, x*a11+y*a21, x*a13
faddp st(4),st ; y*a24, y*a23, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13
fxch st(1) ; y*a23, y*a24, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13
faddp st(5),st ; y*a24, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
; ---- z terms: z is loaded before the last pending y add (y*a24) is done ----
fld [dword ptr esi+8] ; z, y*a24, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fxch st(1) ; y*a24, z, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
faddp st(2),st ; z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fld st(0) ; z, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fmul [dword ptr ebx+8] ; z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fld st(1) ; z, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fmul [dword ptr ebx+24] ; z*a32, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fld st(2) ; z, z*a32, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fmul [dword ptr ebx+40] ; z*a33, z*a32, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fxch st(3) ; z, z*a32, z*a31, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fmul [dword ptr ebx+56] ; z*a34, z*a32, z*a31, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
fxch st(2) ; z*a31, z*a32, z*a34, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
faddp st(6),st ; z*a32, z*a34, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21+z*a31, x*a13+y*a23
faddp st(4),st ; z*a34, z*a33, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23
fxch st(1) ; z*a33, z*a34, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23
faddp st(5),st ; z*a34, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
; ---- w terms: w is loaded before the last pending z add (z*a34) is done ----
fld [dword ptr esi+12] ; w, z*a34, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fxch st(1) ; z*a34, w, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
faddp st(2),st ; w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fld st(0) ; w, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fmul [dword ptr ebx+12] ; w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fld st(1) ; w, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fmul [dword ptr ebx+28] ; w*a42, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fld st(2) ; w, w*a42, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fmul [dword ptr ebx+44] ; w*a43, w*a42, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fxch st(3) ; w, w*a42, w*a41, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fmul [dword ptr ebx+60] ; w*a44, w*a42, w*a41, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
fxch st(2) ; w*a41, w*a42, w*a44, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
faddp st(6),st ; w*a42, w*a44, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33
faddp st(4),st ; w*a44, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33
fxch st(1) ; w*a43, w*a44, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33
faddp st(5),st ; w*a44, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33+w*a43
; ---- store the four sums; the last add (w*a44) is done between the stores ----
fxch st(3) ; x*a11+y*a21+z*a31+w*a41, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, w*a44, x*a13+y*a23+z*a33+w*a43
fstp [dword ptr edi] ; x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, w*a44, x*a13+y*a23+z*a33+w*a43
faddp st(2),st ; x*a12+y*a22+z*a32+w*a42, x*a14+y*a24+z*a34+w*a44, x*a13+y*a23+z*a33+w*a43
fstp [dword ptr edi+4] ; x*a14+y*a24+z*a34+w*a44, x*a13+y*a23+z*a33+w*a43
fxch st(1) ; x*a13+y*a23+z*a33+w*a43, x*a14+y*a24+z*a34+w*a44
fstp [dword ptr edi+8] ; x*a14+y*a24+z*a34+w*a44
fstp [dword ptr edi+12] ; (nothing left from this macro; stack depth is back to its entry level)
ENDM