Inverse squarerootAssembler/80386+FPU

invsqrt computes an approximation to the inverse square root of an IEEE-754 single precision number. The algorithm used is that described in this paper: Debjit Das Sarma and David W. Matula: "Faithful Bipartite ROM Reciprocal Tables", Proceedings 12th Symposium on Computer Arithmetic, July 19-21, 1995, Bath, England.

This code delivers inverse square root results that differ by at most 1 ulp from correctly rounded single precision results. The correctly rounded single precision result is returned for > 98% of all arguments if the FPU precision is set to either extended precision (default under DOS) or double precision (default under WindowsNT). This routine works does not work correctly if the input is negative, zero, a denormal, and infinity or a NaN. It works properly for arguments in [2^-126, 2^128], which is the range of positive, normalized IEEE single precision numbers and is roughly equivalent to [1.1755e-38, 3.4028e38].

Note that this algorithm uses relatively large tables with a combined size of 1.5 KB, hence it may be prone to cache misses if called infrequently or if used on machines with a small data cache. In case all memory accesses are cache hits, this code has been timed to have a latency of 39 cycles on a PentiumMMX and 34 cycles on a PentiumII.

;
; invsqrt - compute an approximation to the inverse square root of an
;           IEEE-754 single precision number.
;
; input:
;    ebx  = pointer to IEEE-754 single precision number, argument
;    esi  = pointer to IEEE-754 single precision number, res = 1/sqrt(arg)
;
; output:
;   [esi] = 1/sqrt(arg)
;
; destroys:
;   eax, ecx, edx, edi
;   eflags
;

MACRO   INVSQRT

     DATASEG

taba    dw      0695eh,068aah,067f7h,06745h,06694h,065e4h,06535h,06488h
        dw      063dbh,0632fh,06284h,061dah,06131h,06089h,05fe2h,05f3ch
        dw      05e9dh,05df9h,05d55h,05cb3h,05c11h,05b71h,05ad0h,05a31h
        dw      05993h,058f5h,05859h,057bdh,05722h,05688h,055efh,05556h
        dw      054c5h,0542eh,05398h,05302h,0526eh,051dah,05147h,050b5h
        dw      05023h,04f92h,04f02h,04e73h,04de4h,04d56h,04cc9h,04c3ch
        dw      04bb5h,04b2ah,04a9fh,04a15h,0498ch,04904h,0487ch,047f5h
        dw      0476eh,046e8h,04663h,045deh,0455ah,044d7h,04454h,043d2h
        dw      04355h,042d4h,04254h,041d4h,04155h,040d6h,04058h,03fdbh
        dw      03f5eh,03ee2h,03e66h,03debh,03d71h,03cf7h,03c7dh,03c04h
        dw      03b91h,03b19h,03aa2h,03a2bh,039b5h,0393fh,038cah,03855h
        dw      037e1h,0376dh,036fah,03687h,03615h,035a3h,03532h,034c1h
        dw      03454h,033e4h,03375h,03306h,03298h,0322ah,031bch,0314fh
        dw      030e3h,03076h,0300bh,02fa0h,02f35h,02ecbh,02e61h,02df7h
        dw      02d92h,02d29h,02cc1h,02c59h,02bf2h,02b8bh,02b25h,02abeh
        dw      02a59h,029f3h,0298eh,0292ah,028c6h,02862h,027ffh,0279ch
        dw      0273ch,026dah,02678h,02617h,025b6h,02555h,024f5h,02495h
        dw      02435h,023d6h,02378h,02319h,022bbh,0225dh,02200h,021a3h
        dw      02148h,020ech,02090h,02034h,01fd9h,01f7eh,01f23h,01ec9h
        dw      01e6fh,01e15h,01dbch,01d63h,01d0ah,01cb2h,01c5ah,01c02h
        dw      01baeh,01b57h,01b00h,01aaah,01a54h,019feh,019a8h,01953h
        dw      018feh,018aah,01855h,01801h,017aeh,0175ah,01707h,016b4h
        dw      01663h,01610h,015bfh,0156dh,0151bh,014cah,01479h,01429h
        dw      013d8h,01389h,01339h,012e9h,0129ah,0124bh,011fch,011aeh
        dw      01161h,01113h,010c6h,01078h,0102bh,00fdeh,00f91h,00f45h
        dw      00ef9h,00eadh,00e61h,00e16h,00dcbh,00d80h,00d35h,00cebh
        dw      00ca3h,00c59h,00c0fh,00bc6h,00b7ch,00b33h,00aebh,00aa2h
        dw      00a5ah,00a12h,009cah,00982h,0093bh,008f4h,008adh,00866h
        dw      00821h,007dbh,00795h,0074fh,00709h,006c4h,0067fh,0063ah
        dw      005f5h,005b0h,0056ch,00528h,004e4h,004a0h,0045dh,00419h
        dw      003d8h,00395h,00352h,00310h,002cdh,0028bh,00249h,00207h
        dw      001c6h,00185h,00143h,00102h,000c2h,00081h,00041h,00000h
        dw      0ff0dh,0fe0eh,0fd12h,0fc16h,0fb1ch,0fa23h,0f92ch,0f836h
        dw      0f741h,0f64fh,0f55dh,0f46ch,0f37dh,0f290h,0f1a3h,0f0b9h
        dw      0efd9h,0eef0h,0ee09h,0ed23h,0ec3fh,0eb5bh,0ea79h,0e998h
        dw      0e8b8h,0e7d9h,0e6fch,0e620h,0e545h,0e46bh,0e392h,0e2bah
        dw      0e1ebh,0e116h,0e042h,0df6eh,0de9ch,0ddcbh,0dcfbh,0dc2ch
        dw      0db5eh,0da91h,0d9c6h,0d8fbh,0d831h,0d768h,0d6a1h,0d5dah
        dw      0d51bh,0d457h,0d393h,0d2d0h,0d20eh,0d14dh,0d08dh,0cfcdh
        dw      0cf0fh,0ce52h,0cd96h,0ccdah,0cc1fh,0cb66h,0caach,0c9f4h
        dw      0c943h,0c88dh,0c7d7h,0c723h,0c66fh,0c5bch,0c50ah,0c459h
        dw      0c3a8h,0c2f9h,0c24ah,0c19bh,0c0eeh,0c041h,0bf96h,0beebh
        dw      0be46h,0bd9dh,0bcf4h,0bc4ch,0bba5h,0bafeh,0ba58h,0b9b3h
        dw      0b90fh,0b86bh,0b7c8h,0b726h,0b685h,0b5e4h,0b543h,0b4a4h
        dw      0b40bh,0b36dh,0b2d0h,0b233h,0b197h,0b0fbh,0b060h,0afc6h
        dw      0af2dh,0ae94h,0adfch,0ad64h,0accdh,0ac37h,0aba1h,0ab0ch
        dw      0aa7ch,0a9e8h,0a954h,0a8c2h,0a830h,0a79eh,0a70dh,0a67dh
        dw      0a5edh,0a55eh,0a4cfh,0a441h,0a3b4h,0a326h,0a29ah,0a20eh
        dw      0a187h,0a0fch,0a071h,09fe8h,09f5eh,09ed6h,09e4eh,09dc6h
        dw      09d3fh,09cb8h,09c32h,09badh,09b28h,09aa3h,09a1fh,0999ch
        dw      0991ch,09899h,09817h,09796h,09715h,09694h,09614h,09594h
        dw      09515h,09496h,09418h,0939ah,0931dh,092a0h,09223h,091a7h
        dw      0912fh,090b4h,09039h,08fbfh,08f45h,08ecch,08e53h,08ddah
        dw      08d62h,08ceah,08c73h,08bfch,08b86h,08b10h,08a9bh,08a25h
        dw      089b3h,0893eh,088cbh,08857h,087e4h,08771h,086ffh,0868dh
        dw      0861bh,085aah,08539h,084c9h,08459h,083e9h,0837ah,0830bh
        dw      0829fh,08231h,081c3h,08155h,080e8h,0807bh,0800fh,07fa3h
        dw      07f37h,07ecch,07e61h,07df6h,07d8ch,07d22h,07cb9h,07c4fh
        dw      07be9h,07b81h,07b19h,07ab1h,07a49h,079e2h,0797bh,07915h
        dw      078aeh,07848h,077e3h,0777dh,07718h,076b4h,0764fh,075ebh
        dw      07589h,07526h,074c3h,07460h,073fdh,0739bh,07339h,072d8h
        dw      07276h,07215h,071b5h,07155h,070f4h,07095h,07035h,06fd6h
        dw      06f7ah,06f1bh,06ebdh,06e5fh,06e01h,06da3h,06d46h,06ce9h
        dw      06c8ch,06c30h,06bd4h,06b78h,06b1ch,06ac1h,06a66h,06a0bh

tabb    db      0a3h,098h,08dh,082h,077h,06dh,062h,057h
        db      04ch,042h,037h,02ch,021h,017h,00ch,001h
        db      095h,08bh,081h,077h,06eh,064h,05ah,050h
        db      046h,03ch,032h,028h,01eh,015h,00bh,001h
        db      089h,080h,077h,06eh,064h,05bh,052h,049h
        db      040h,037h,02eh,024h,01ch,013h,009h,000h
        db      07fh,077h,06eh,066h,05dh,055h,04dh,044h
        db      03ch,033h,02bh,023h,01ah,012h,009h,001h
        db      076h,06eh,067h,05fh,057h,04fh,047h,040h
        db      038h,030h,028h,020h,018h,011h,009h,001h
        db      06dh,066h,05eh,057h,050h,048h,041h,03ah
        db      033h,02bh,024h,01dh,016h,00eh,007h,000h
        db      066h,060h,059h,052h,04bh,044h,03dh,037h
        db      030h,029h,022h,01ch,015h,00eh,007h,001h
        db      060h,059h,053h,04dh,046h,040h,039h,033h
        db      02dh,026h,020h,01ah,013h,00dh,007h,000h
        db      05ah,054h,04eh,048h,042h,03ch,036h,030h
        db      02ah,024h,01eh,018h,012h,00ch,006h,001h
        db      055h,050h,04ah,045h,03fh,039h,034h,02eh
        db      028h,023h,01dh,017h,012h,00ch,007h,001h
        db      050h,04ah,045h,040h,03ah,035h,030h,02ah
        db      025h,020h,01bh,015h,010h,00bh,005h,000h
        db      04ch,047h,042h,03dh,038h,033h,02eh,029h
        db      024h,01fh,01ah,015h,010h,00bh,006h,001h
        db      048h,044h,03fh,03ah,035h,031h,02ch,027h
        db      022h,01eh,019h,014h,00fh,00bh,006h,001h
        db      044h,040h,03bh,037h,032h,02eh,029h,025h
        db      020h,01ch,017h,013h,00eh,00ah,005h,001h
        db      041h,03dh,039h,034h,030h,02ch,027h,023h
        db      01fh,01bh,016h,012h,00dh,009h,005h,001h
        db      03eh,03ah,036h,032h,02eh,02ah,026h,022h
        db      01dh,019h,015h,011h,00dh,009h,005h,001h
        db      0e6h,0d7h,0c8h,0b8h,0a9h,099h,08ah,07bh
        db      06ch,05ch,04dh,03eh,02eh,01fh,010h,000h
        db      0d2h,0c4h,0b6h,0a8h,09ah,08ch,07eh,070h
        db      062h,054h,046h,038h,02ah,01ch,00eh,000h
        db      0c2h,0b5h,0a8h,09bh,08eh,082h,075h,068h
        db      05bh,04eh,041h,034h,028h,01bh,00eh,001h
        db      0b3h,0a7h,09ch,08fh,083h,077h,06bh,060h
        db      054h,048h,03ch,030h,024h,018h,00ch,000h
        db      0a6h,09bh,090h,085h,07ah,06fh,064h,059h
        db      04eh,043h,038h,02dh,022h,017h,00ch,001h
        db      09bh,091h,086h,07ch,072h,068h,05dh,053h
        db      049h,03fh,034h,02ah,020h,015h,00bh,001h
        db      090h,087h,07dh,073h,06ah,060h,057h,04dh
        db      043h,03ah,030h,027h,01dh,013h,00ah,000h
        db      087h,07eh,075h,06ch,063h,05ah,051h,048h
        db      03fh,036h,02dh,024h,01bh,012h,009h,000h
        db      07fh,077h,06eh,066h,05dh,055h,04dh,044h
        db      03bh,033h,02bh,022h,01ah,011h,009h,000h
        db      078h,070h,068h,060h,058h,050h,048h,040h
        db      038h,030h,028h,020h,018h,010h,008h,000h
        db      071h,069h,062h,05ah,053h,04bh,044h,03ch
        db      035h,02dh,026h,01eh,017h,00fh,008h,000h
        db      06bh,064h,05dh,056h,04fh,048h,040h,039h
        db      033h,02bh,024h,01dh,016h,00fh,008h,001h
        db      065h,05fh,058h,051h,04bh,044h,03dh,036h
        db      030h,029h,022h,01ch,015h,00eh,007h,001h
        db      060h,05ah,053h,04dh,046h,040h,03ah,033h
        db      02dh,026h,020h,01ah,013h,00dh,006h,000h
        db      05ch,056h,050h,04ah,044h,03eh,038h,032h
        db      02bh,025h,01fh,019h,013h,00dh,007h,001h
        db      057h,052h,04ch,046h,040h,03ah,035h,02fh
        db      029h,023h,01dh,018h,012h,00ch,006h,000h

        CODESEG

        mov     eax,[ebx]       ; arg (single precision floating-point)
        mov     edi,0be000000h  ; 0xBE000000L
        mov     ecx,eax         ; arg
        mov     edx,eax         ; arg
        shr     eax,15          ; arg >> 15
        and     edx,07f800000h  ; arg & 0x7F800000
        shr     ecx,11          ; arg >> 11
        sub     edi,edx         ; (0xBE000000L - ((arg->i) & 0x7F800000L))
        and     eax,0000001ffh  ; indexa = (arg >> 15) & 0x1FF
        and     ecx,00000000fh  ; (arg >> 11) & 0xF
        mov     edx,eax         ; indexa
        and     eax,0000001f0h  ; indexa & 0x1F0
        movzx   edx,[word ptr edx*2+taba]       ; taba[indexa]
        movzx   eax,[byte ptr eax+ecx+tabb]     ; tabb[indexa & 0x1F0 | (arg >> 11) & 0xF]
        shr     edi,1           ; (0xBE000000L - ((arg->i) & 0x7F800000L)) >> 1
        add     eax,edx         ; taba[indexa]+tabb[indexb] = a+b
        and     edi,07f800000h  ; ((0xBE000000L - ((arg->i) & 0x7F800000L)) >> 1) & 0x7F800000L
        shl     eax,7           ; (a+b) << 7
        mov     ecx,03f000000h  ; 0.5 in single precision
        add     eax,edi         ; 1/sqrt(arg) approx (single precision)
        push    eax             ; push approx onto stack
        push    ecx             ; push 0.5 onto stack
        mov     ecx,040400000h  ; 3.0 in single precision
        push    ecx             ; push 3.0 onto stack and approx
        fld     [dword ptr esp+8]
        fld     st(0)           ; approx approx
        fmul    st,st(0)        ; approx*approx approx
        fxch    st(1)           ; approx approx*approx
        fmul    [dword ptr esp+4]          ; approx*0.5 approx*approx
        fxch    st(1)           ; approx*approx approx*0.5
        fmul    [dword ptr ebx] ; approx*approx*arg approx*0.5
        fsubr   [dword ptr esp] ; 3-approx*approx*arg approx*0.5
        fmulp   st(1),st        ; res = (3-approx*approx*arg)*guess*0.5
        fstp    [dword ptr esi] ; store result
        add     esp,12          ; remove floating-point temps
ENDM
Gem writer: Norbert Juffa
last updated: 1998-04-18