4x4 by 4x1 matrix multiply (Assembler/80386+FPU)

Here is a matrix multiply routine that multiplies a 4x4 matrix by a 4x1 vector, where both the matrix and vector elements are single precision floating-point numbers. This routine is probably close to optimal on a Pentium. This code was written by an ex-colleague of mine. I merely added the comments and measured the execution time.

;
; MATVECMUL -- 4x4 by 4x1 matrix multiply (single precision, x87 FPU)
; executes in 58 cycles (author's measurement; the accompanying text
; names the Pentium as the target CPU)
;
; input:
;   esi = pointer to original 4x1 vector: dwords x, y, z, w at
;         offsets 0, 4, 8 and 12
;   ebx = pointer to 4x4 transformation matrix (16 dwords); in the
;         comments below, a<i><j> is the dword at
;         [ebx + 4*(i-1) + 16*(j-1)], e.g. a11 = [ebx], a12 = [ebx+16],
;         a21 = [ebx+4]
;   edi = pointer to the 4 dwords that receive the transformed vector
;
; output:
;   [edi + 4*(j-1)] = x*a1j + y*a2j + z*a3j + w*a4j   for j = 1..4
;   i.e. result element k (0..3) is the dot product of (x, y, z, w)
;   with the four consecutive matrix dwords starting at [ebx + 16*k]
;
; destroys:
;   fp stack -- up to eight values are live on the x87 register stack
;   at once, so all eight registers must be free on entry. The 16
;   pushes (fld) are matched by 16 pops (faddp/fstp), so the stack
;   depth on exit equals the depth on entry.
;   esi, ebx and edi are only used for addressing and are not written.
;
; notes:
;   - Every load from [esi...] and [ebx...] is issued before the first
;     store to [edi...], so the destination may overlap the source
;     vector (in-place transform).
;   - The trailing comment on each instruction shows the FPU stack
;     AFTER that instruction has executed, st(0) first.
;   - NOTE(review): the instruction order (interleaved fld/fmul with
;     fxch shuffles) looks hand-scheduled, presumably to exploit
;     Pentium FPU pipelining and cheap fxch -- confirm against Pentium
;     optimization documentation and re-measure before reordering.
;

MACRO   MATVECMUL

        ; ---- x: form the four products x*a11 .. x*a14 ----
        fld     [dword ptr esi]    ; x
        fld     st(0)              ; x, x
        fmul    [dword ptr ebx]    ; x*a11, x
        fld     st(1)              ; x, x*a11, x
        fmul    [dword ptr ebx+16] ; x*a12, x*a11, x
        fld     st(2)              ; x, x*a12, x*a11, x
        fmul    [dword ptr ebx+32] ; x*a13, x*a12, x*a11, x
        fxch    st(3)              ; x, x*a12, x*a11, x*a13
        fmul    [dword ptr ebx+48] ; x*a14, x*a12, x*a11, x*a13
        ; ---- y: form y*a21 .. y*a24 (stack reaches its maximum depth of 8) ----
        fld     [dword ptr esi+4]  ; y, x*a14, x*a12, x*a11, x*a13
        fld     st(0)              ; y, y, x*a14, x*a12, x*a11, x*a13
        fmul    [dword ptr ebx+4]  ; y*a21, y, x*a14, x*a12, x*a11, x*a13
        fld     st(1)              ; y, y*a21, y, x*a14, x*a12, x*a11, x*a13
        fmul    [dword ptr ebx+20] ; y*a22, y*a21, y, x*a14, x*a12, x*a11, x*a13
        fld     st(2)              ; y, y*a22, y*a21, y, x*a14, x*a12, x*a11, x*a13
        fmul    [dword ptr ebx+36] ; y*a23, y*a22, y*a21, y, x*a14, x*a12, x*a11, x*a13
        fxch    st(3)              ; y, y*a22, y*a21, y*a23, x*a14, x*a12, x*a11, x*a13
        fmul    [dword ptr ebx+52] ; y*a24, y*a22, y*a21, y*a23, x*a14, x*a12, x*a11, x*a13
        ; ---- fold the y products into the running sums ----
        fxch    st(2)              ; y*a21, y*a22, y*a24, y*a23, x*a14, x*a12, x*a11, x*a13
        faddp   st(6),st           ; y*a22, y*a24, y*a23, x*a14, x*a12, x*a11+y*a21, x*a13
        faddp   st(4),st           ; y*a24, y*a23, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13
        fxch    st(1)              ; y*a23, y*a24, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13
        faddp   st(5),st           ; y*a24, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        ; ---- z: the load of z is issued before the last y product (y*a24) is added ----
        fld     [dword ptr esi+8]  ; z, y*a24, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fxch    st(1)              ; y*a24, z, x*a14, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        faddp   st(2),st           ; z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fld     st(0)              ; z, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fmul    [dword ptr ebx+8]  ; z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fld     st(1)              ; z, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fmul    [dword ptr ebx+24] ; z*a32, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fld     st(2)              ; z, z*a32, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fmul    [dword ptr ebx+40] ; z*a33, z*a32, z*a31, z, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fxch    st(3)              ; z, z*a32, z*a31, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        fmul    [dword ptr ebx+56] ; z*a34, z*a32, z*a31, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        ; ---- fold the z products into the running sums ----
        fxch    st(2)              ; z*a31, z*a32, z*a34, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21, x*a13+y*a23
        faddp   st(6),st           ; z*a32, z*a34, z*a33, x*a14+y*a24, x*a12+y*a22, x*a11+y*a21+z*a31, x*a13+y*a23
        faddp   st(4),st           ; z*a34, z*a33, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23
        fxch    st(1)              ; z*a33, z*a34, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23
        faddp   st(5),st           ; z*a34, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        ; ---- w: the load of w is issued before the last z product (z*a34) is added ----
        fld     [dword ptr esi+12] ; w, z*a34, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fxch    st(1)              ; z*a34, w, x*a14+y*a24, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        faddp   st(2),st           ; w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fld     st(0)              ; w, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fmul    [dword ptr ebx+12] ; w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fld     st(1)              ; w, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fmul    [dword ptr ebx+28] ; w*a42, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fld     st(2)              ; w, w*a42, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fmul    [dword ptr ebx+44] ; w*a43, w*a42, w*a41, w, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fxch    st(3)              ; w, w*a42, w*a41, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        fmul    [dword ptr ebx+60] ; w*a44, w*a42, w*a41, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        ; ---- fold the w products into the running sums ----
        fxch    st(2)              ; w*a41, w*a42, w*a44, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31, x*a13+y*a23+z*a33
        faddp   st(6),st           ; w*a42, w*a44, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33
        faddp   st(4),st           ; w*a44, w*a43, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33
        fxch    st(1)              ; w*a43, w*a44, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33
        faddp   st(5),st           ; w*a44, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, x*a11+y*a21+z*a31+w*a41, x*a13+y*a23+z*a33+w*a43
        ; ---- store the results; the last add (w*a44) is interleaved with the stores ----
        fxch    st(3)              ; x*a11+y*a21+z*a31+w*a41, x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, w*a44, x*a13+y*a23+z*a33+w*a43
        fstp    [dword ptr edi]    ; x*a14+y*a24+z*a34, x*a12+y*a22+z*a32+w*a42, w*a44, x*a13+y*a23+z*a33+w*a43
        faddp   st(2),st           ; x*a12+y*a22+z*a32+w*a42, x*a14+y*a24+z*a34+w*a44, x*a13+y*a23+z*a33+w*a43
        fstp    [dword ptr edi+4]  ; x*a14+y*a24+z*a34+w*a44, x*a13+y*a23+z*a33+w*a43
        fxch    st(1)              ; x*a13+y*a23+z*a33+w*a43, x*a14+y*a24+z*a34+w*a44
        fstp    [dword ptr edi+8]  ; x*a14+y*a24+z*a34+w*a44
        fstp    [dword ptr edi+12] ; (all values pushed by this macro have been popped)

ENDM
Gem writer: Norbert Juffa
last updated: 1998-03-16