Inverse squareroot | Assembler/80386+FPU |
This code delivers inverse square root results that differ by at most 1 ulp from correctly rounded single precision results. The correctly rounded single precision result is returned for > 98% of all arguments if the FPU precision is set to either extended precision (default under DOS) or double precision (default under WindowsNT). This routine works does not work correctly if the input is negative, zero, a denormal, and infinity or a NaN. It works properly for arguments in [2^-126, 2^128]
, which is the range of positive, normalized IEEE single precision numbers and is roughly equivalent to [1.1755e-38, 3.4028e38]
.
Note that this algorithm uses relatively large tables with a combined size of 1.5 KB, hence it may be prone to cache misses if called infrequently or if used on machines with a small data cache. In case all memory accesses are cache hits, this code has been timed to have a latency of 39 cycles on a PentiumMMX and 34 cycles on a PentiumII.;
; invsqrt - compute an approximation to the inverse square root of an
; IEEE-754 single precision number.
;
; input:
; ebx = pointer to IEEE-754 single precision number, argument
; esi = pointer to IEEE-754 single precision number, res = 1/sqrt(arg)
;
; output:
; [esi] = 1/sqrt(arg)
;
; destroys:
; eax, ecx, edx, edi
; eflags
;
MACRO INVSQRT
DATASEG
taba dw 0695eh,068aah,067f7h,06745h,06694h,065e4h,06535h,06488h
dw 063dbh,0632fh,06284h,061dah,06131h,06089h,05fe2h,05f3ch
dw 05e9dh,05df9h,05d55h,05cb3h,05c11h,05b71h,05ad0h,05a31h
dw 05993h,058f5h,05859h,057bdh,05722h,05688h,055efh,05556h
dw 054c5h,0542eh,05398h,05302h,0526eh,051dah,05147h,050b5h
dw 05023h,04f92h,04f02h,04e73h,04de4h,04d56h,04cc9h,04c3ch
dw 04bb5h,04b2ah,04a9fh,04a15h,0498ch,04904h,0487ch,047f5h
dw 0476eh,046e8h,04663h,045deh,0455ah,044d7h,04454h,043d2h
dw 04355h,042d4h,04254h,041d4h,04155h,040d6h,04058h,03fdbh
dw 03f5eh,03ee2h,03e66h,03debh,03d71h,03cf7h,03c7dh,03c04h
dw 03b91h,03b19h,03aa2h,03a2bh,039b5h,0393fh,038cah,03855h
dw 037e1h,0376dh,036fah,03687h,03615h,035a3h,03532h,034c1h
dw 03454h,033e4h,03375h,03306h,03298h,0322ah,031bch,0314fh
dw 030e3h,03076h,0300bh,02fa0h,02f35h,02ecbh,02e61h,02df7h
dw 02d92h,02d29h,02cc1h,02c59h,02bf2h,02b8bh,02b25h,02abeh
dw 02a59h,029f3h,0298eh,0292ah,028c6h,02862h,027ffh,0279ch
dw 0273ch,026dah,02678h,02617h,025b6h,02555h,024f5h,02495h
dw 02435h,023d6h,02378h,02319h,022bbh,0225dh,02200h,021a3h
dw 02148h,020ech,02090h,02034h,01fd9h,01f7eh,01f23h,01ec9h
dw 01e6fh,01e15h,01dbch,01d63h,01d0ah,01cb2h,01c5ah,01c02h
dw 01baeh,01b57h,01b00h,01aaah,01a54h,019feh,019a8h,01953h
dw 018feh,018aah,01855h,01801h,017aeh,0175ah,01707h,016b4h
dw 01663h,01610h,015bfh,0156dh,0151bh,014cah,01479h,01429h
dw 013d8h,01389h,01339h,012e9h,0129ah,0124bh,011fch,011aeh
dw 01161h,01113h,010c6h,01078h,0102bh,00fdeh,00f91h,00f45h
dw 00ef9h,00eadh,00e61h,00e16h,00dcbh,00d80h,00d35h,00cebh
dw 00ca3h,00c59h,00c0fh,00bc6h,00b7ch,00b33h,00aebh,00aa2h
dw 00a5ah,00a12h,009cah,00982h,0093bh,008f4h,008adh,00866h
dw 00821h,007dbh,00795h,0074fh,00709h,006c4h,0067fh,0063ah
dw 005f5h,005b0h,0056ch,00528h,004e4h,004a0h,0045dh,00419h
dw 003d8h,00395h,00352h,00310h,002cdh,0028bh,00249h,00207h
dw 001c6h,00185h,00143h,00102h,000c2h,00081h,00041h,00000h
dw 0ff0dh,0fe0eh,0fd12h,0fc16h,0fb1ch,0fa23h,0f92ch,0f836h
dw 0f741h,0f64fh,0f55dh,0f46ch,0f37dh,0f290h,0f1a3h,0f0b9h
dw 0efd9h,0eef0h,0ee09h,0ed23h,0ec3fh,0eb5bh,0ea79h,0e998h
dw 0e8b8h,0e7d9h,0e6fch,0e620h,0e545h,0e46bh,0e392h,0e2bah
dw 0e1ebh,0e116h,0e042h,0df6eh,0de9ch,0ddcbh,0dcfbh,0dc2ch
dw 0db5eh,0da91h,0d9c6h,0d8fbh,0d831h,0d768h,0d6a1h,0d5dah
dw 0d51bh,0d457h,0d393h,0d2d0h,0d20eh,0d14dh,0d08dh,0cfcdh
dw 0cf0fh,0ce52h,0cd96h,0ccdah,0cc1fh,0cb66h,0caach,0c9f4h
dw 0c943h,0c88dh,0c7d7h,0c723h,0c66fh,0c5bch,0c50ah,0c459h
dw 0c3a8h,0c2f9h,0c24ah,0c19bh,0c0eeh,0c041h,0bf96h,0beebh
dw 0be46h,0bd9dh,0bcf4h,0bc4ch,0bba5h,0bafeh,0ba58h,0b9b3h
dw 0b90fh,0b86bh,0b7c8h,0b726h,0b685h,0b5e4h,0b543h,0b4a4h
dw 0b40bh,0b36dh,0b2d0h,0b233h,0b197h,0b0fbh,0b060h,0afc6h
dw 0af2dh,0ae94h,0adfch,0ad64h,0accdh,0ac37h,0aba1h,0ab0ch
dw 0aa7ch,0a9e8h,0a954h,0a8c2h,0a830h,0a79eh,0a70dh,0a67dh
dw 0a5edh,0a55eh,0a4cfh,0a441h,0a3b4h,0a326h,0a29ah,0a20eh
dw 0a187h,0a0fch,0a071h,09fe8h,09f5eh,09ed6h,09e4eh,09dc6h
dw 09d3fh,09cb8h,09c32h,09badh,09b28h,09aa3h,09a1fh,0999ch
dw 0991ch,09899h,09817h,09796h,09715h,09694h,09614h,09594h
dw 09515h,09496h,09418h,0939ah,0931dh,092a0h,09223h,091a7h
dw 0912fh,090b4h,09039h,08fbfh,08f45h,08ecch,08e53h,08ddah
dw 08d62h,08ceah,08c73h,08bfch,08b86h,08b10h,08a9bh,08a25h
dw 089b3h,0893eh,088cbh,08857h,087e4h,08771h,086ffh,0868dh
dw 0861bh,085aah,08539h,084c9h,08459h,083e9h,0837ah,0830bh
dw 0829fh,08231h,081c3h,08155h,080e8h,0807bh,0800fh,07fa3h
dw 07f37h,07ecch,07e61h,07df6h,07d8ch,07d22h,07cb9h,07c4fh
dw 07be9h,07b81h,07b19h,07ab1h,07a49h,079e2h,0797bh,07915h
dw 078aeh,07848h,077e3h,0777dh,07718h,076b4h,0764fh,075ebh
dw 07589h,07526h,074c3h,07460h,073fdh,0739bh,07339h,072d8h
dw 07276h,07215h,071b5h,07155h,070f4h,07095h,07035h,06fd6h
dw 06f7ah,06f1bh,06ebdh,06e5fh,06e01h,06da3h,06d46h,06ce9h
dw 06c8ch,06c30h,06bd4h,06b78h,06b1ch,06ac1h,06a66h,06a0bh
tabb db 0a3h,098h,08dh,082h,077h,06dh,062h,057h
db 04ch,042h,037h,02ch,021h,017h,00ch,001h
db 095h,08bh,081h,077h,06eh,064h,05ah,050h
db 046h,03ch,032h,028h,01eh,015h,00bh,001h
db 089h,080h,077h,06eh,064h,05bh,052h,049h
db 040h,037h,02eh,024h,01ch,013h,009h,000h
db 07fh,077h,06eh,066h,05dh,055h,04dh,044h
db 03ch,033h,02bh,023h,01ah,012h,009h,001h
db 076h,06eh,067h,05fh,057h,04fh,047h,040h
db 038h,030h,028h,020h,018h,011h,009h,001h
db 06dh,066h,05eh,057h,050h,048h,041h,03ah
db 033h,02bh,024h,01dh,016h,00eh,007h,000h
db 066h,060h,059h,052h,04bh,044h,03dh,037h
db 030h,029h,022h,01ch,015h,00eh,007h,001h
db 060h,059h,053h,04dh,046h,040h,039h,033h
db 02dh,026h,020h,01ah,013h,00dh,007h,000h
db 05ah,054h,04eh,048h,042h,03ch,036h,030h
db 02ah,024h,01eh,018h,012h,00ch,006h,001h
db 055h,050h,04ah,045h,03fh,039h,034h,02eh
db 028h,023h,01dh,017h,012h,00ch,007h,001h
db 050h,04ah,045h,040h,03ah,035h,030h,02ah
db 025h,020h,01bh,015h,010h,00bh,005h,000h
db 04ch,047h,042h,03dh,038h,033h,02eh,029h
db 024h,01fh,01ah,015h,010h,00bh,006h,001h
db 048h,044h,03fh,03ah,035h,031h,02ch,027h
db 022h,01eh,019h,014h,00fh,00bh,006h,001h
db 044h,040h,03bh,037h,032h,02eh,029h,025h
db 020h,01ch,017h,013h,00eh,00ah,005h,001h
db 041h,03dh,039h,034h,030h,02ch,027h,023h
db 01fh,01bh,016h,012h,00dh,009h,005h,001h
db 03eh,03ah,036h,032h,02eh,02ah,026h,022h
db 01dh,019h,015h,011h,00dh,009h,005h,001h
db 0e6h,0d7h,0c8h,0b8h,0a9h,099h,08ah,07bh
db 06ch,05ch,04dh,03eh,02eh,01fh,010h,000h
db 0d2h,0c4h,0b6h,0a8h,09ah,08ch,07eh,070h
db 062h,054h,046h,038h,02ah,01ch,00eh,000h
db 0c2h,0b5h,0a8h,09bh,08eh,082h,075h,068h
db 05bh,04eh,041h,034h,028h,01bh,00eh,001h
db 0b3h,0a7h,09ch,08fh,083h,077h,06bh,060h
db 054h,048h,03ch,030h,024h,018h,00ch,000h
db 0a6h,09bh,090h,085h,07ah,06fh,064h,059h
db 04eh,043h,038h,02dh,022h,017h,00ch,001h
db 09bh,091h,086h,07ch,072h,068h,05dh,053h
db 049h,03fh,034h,02ah,020h,015h,00bh,001h
db 090h,087h,07dh,073h,06ah,060h,057h,04dh
db 043h,03ah,030h,027h,01dh,013h,00ah,000h
db 087h,07eh,075h,06ch,063h,05ah,051h,048h
db 03fh,036h,02dh,024h,01bh,012h,009h,000h
db 07fh,077h,06eh,066h,05dh,055h,04dh,044h
db 03bh,033h,02bh,022h,01ah,011h,009h,000h
db 078h,070h,068h,060h,058h,050h,048h,040h
db 038h,030h,028h,020h,018h,010h,008h,000h
db 071h,069h,062h,05ah,053h,04bh,044h,03ch
db 035h,02dh,026h,01eh,017h,00fh,008h,000h
db 06bh,064h,05dh,056h,04fh,048h,040h,039h
db 033h,02bh,024h,01dh,016h,00fh,008h,001h
db 065h,05fh,058h,051h,04bh,044h,03dh,036h
db 030h,029h,022h,01ch,015h,00eh,007h,001h
db 060h,05ah,053h,04dh,046h,040h,03ah,033h
db 02dh,026h,020h,01ah,013h,00dh,006h,000h
db 05ch,056h,050h,04ah,044h,03eh,038h,032h
db 02bh,025h,01fh,019h,013h,00dh,007h,001h
db 057h,052h,04ch,046h,040h,03ah,035h,02fh
db 029h,023h,01dh,018h,012h,00ch,006h,000h
CODESEG
mov eax,[ebx] ; arg (single precision floating-point)
mov edi,0be000000h ; 0xBE000000L
mov ecx,eax ; arg
mov edx,eax ; arg
shr eax,15 ; arg >> 15
and edx,07f800000h ; arg & 0x7F800000
shr ecx,11 ; arg >> 11
sub edi,edx ; (0xBE000000L - ((arg->i) & 0x7F800000L))
and eax,0000001ffh ; indexa = (arg >> 15) & 0x1FF
and ecx,00000000fh ; (arg >> 11) & 0xF
mov edx,eax ; indexa
and eax,0000001f0h ; indexa & 0x1F0
movzx edx,[word ptr edx*2+taba] ; taba[indexa]
movzx eax,[byte ptr eax+ecx+tabb] ; tabb[indexa & 0x1F0 | (arg >> 11) & 0xF]
shr edi,1 ; (0xBE000000L - ((arg->i) & 0x7F800000L)) >> 1
add eax,edx ; taba[indexa]+tabb[indexb] = a+b
and edi,07f800000h ; ((0xBE000000L - ((arg->i) & 0x7F800000L)) >> 1) & 0x7F800000L
shl eax,7 ; (a+b) << 7
mov ecx,03f000000h ; 0.5 in single precision
add eax,edi ; 1/sqrt(arg) approx (single precision)
push eax ; push approx onto stack
push ecx ; push 0.5 onto stack
mov ecx,040400000h ; 3.0 in single precision
push ecx ; push 3.0 onto stack and approx
fld [dword ptr esp+8]
fld st(0) ; approx approx
fmul st,st(0) ; approx*approx approx
fxch st(1) ; approx approx*approx
fmul [dword ptr esp+4] ; approx*0.5 approx*approx
fxch st(1) ; approx*approx approx*0.5
fmul [dword ptr ebx] ; approx*approx*arg approx*0.5
fsubr [dword ptr esp] ; 3-approx*approx*arg approx*0.5
fmulp st(1),st ; res = (3-approx*approx*arg)*guess*0.5
fstp [dword ptr esi] ; store result
add esp,12 ; remove floating-point temps
ENDM