;/* File: "atan_wmt.asm". */
;//
;//               INTEL CORPORATION PROPRIETARY INFORMATION
;//  This software is supplied under the terms of a license agreement or
;//  nondisclosure agreement with Intel Corporation and may not be copied
;//  or disclosed except in accordance with the terms of that agreement.
;//        Copyright (c) 2000 Intel Corporation. All Rights Reserved.
;//
;//
;//  Contents:      atan.
;//
;//  Purpose:       Libm 
;//

    .686P
    .387
    .XMM
    .MODEL FLAT,C

EXTRN C __libm_error_support : NEAR  

CONST	SEGMENT PARA PUBLIC USE32 'CONST'
        ALIGN 16

EXTRN C _atan_table:QWORD

_atn TEXTEQU <_atan_table>

;/*
;//   FUNCTION:     double atan(double x)
;//
;//   DESCRIPTION:
;//
;//      1. For |x| < 2^(-27), where atan(x) ~= x, return x.
;//      2. For |x| >= 0.1633123935319536975596774e+17, where atan(x) ~= +-Pi/2, return +-Pi/2.
;//      3. In interval [0.0,0.03125] polynomial approximation of atan(x)=x-x*P(x^2).
;//      4. In interval [0.03125,0.375] polynomial approximation of atan(x)=x-x*D(x^2).
;//      5. In interval [0.375,8.0] we compute ind and eps such, that x=0.03125*ind+eps and 0.0<eps<0.03125.
;//         Let s=0.03125*ind, then atan(x)=atan(s)+atan(t), where t=((x-s)/(1+x*s)). For lo and hi part of
;//         atan(s) we have table (see file atan_table.c): atn[ind]+atn[ind+1]=atan(s).
;//         atan(t) is approximated atan(t)=t-t*P(t^2).
;//      6. In interval [8.0,0.1633123935319536975596774e+17] atan(x)=Pi/2+atan(-1/x).
;//         atan(-1/x) is approximated atan(t)=t-t*P(t^2), where t=-1/x.
;//      7. For x < 0.0 atan(x) = -atan(|x|).
;//      8. Special cases:
;//         atan(+0)   = +0;
;//         atan(-0)   = -0;
;//         atan(+INF) = +Pi/2;
;//         atan(-INF) = -Pi/2;
;//         atan(NaN)  = NaN.
;//
;//   KEYS OF COMPILER: -c -w -Zl -Di386 /QIfdiv-
;*/

_mexp   DQ  07ff0000000000000H, 07ff0000000000000H
_mabs   DQ  07fffffffffffffffH, 07fffffffffffffffH
_pi_2d  DQ  03ff921fb54442d18H, 0bff921fb54442d18H
_cntshf DQ  00000000000040201H, 00000000000040201H
_d1400  DQ  03fd5555555555552H, 00000000000000000H
_d1213  DQ  03fc249249246aa76H, 0bfc99999999992acH
_d1011  DQ  03fb745d15933de8aH, 0bfbc71c71b835923H
_d89    DQ  03fb110f5eeb76ecaH, 0bfb3b1390a3b9899H
_d67    DQ  03faae4492fe3a600H, 0bfae1c1704144b68H
_d45    DQ  03fa51fa164891abeH, 0bfa8171d55d53138H
_d23    DQ  03f974721481ca2a2H, 0bfa124ce2388f2cbH
_d01    DQ  03f66107c30e0b8a5H, 0bf866e5652b14bbdH
_p60    DQ  03fd55555555554ebH, 00000000000000000H
_p45    DQ  03fc249249014497eH, 0bfc9999999976718H
_p23    DQ  03fb7453ba342480fH, 0bfbc71c4eebfb10eH
_p01    DQ  03fae9be97b0f8d08H, 0bfb39ad683f878c6H
_zero   DQ  00000000000000000H, 00000000000000000H
_onen   DQ  0bff0000000000000H, 0bff0000000000000H
_one    DQ  03ff0000000000000H, 03ff0000000000000H
_cnst8  DQ  04020000000000000H, 04020000000000000H
_in3    DQ  04020000000000000H, 04020000000000000H
_in2    DQ  03fd8000000000000H, 03fd8000000000000H
_in1    DQ  03fa0000000000000H, 03fa0000000000000H
_in0    DQ  03e40000000000000H, 03e40000000000000H
_in     DQ  0434d02967c31cdb5H, 0434d02967c31cdb5H
_minval DQ  00010000000000000H, 00010000000000000H
libm_small  DQ  00200000000000000H 
CONST	ENDS

_x      TEXTEQU     <esp+4>
XMMWORD TEXTEQU <OWORD>

_TEXT   SEGMENT PARA PUBLIC USE32 'CODE'
        ALIGN       4

PUBLIC C _atan_pentium4, _CIatan_pentium4
_CIatan_pentium4  PROC NEAR
	push	    ebp
	mov 	    ebp, esp 
	sub         esp, 8                          ; for argument DBLSIZE
	and         esp, 0fffffff0h
	fstp        qword ptr [esp]
	movq        xmm7, qword ptr [esp]
	call        start
	leave
	ret
_atan_pentium4   label proc
        movq        xmm7, QWORD PTR [_x]            ;  x
start:
        unpcklpd    xmm7, xmm7
        movapd      xmm2, xmm7
        andpd       xmm2, XMMWORD PTR _mabs         ; |x|
        comisd      xmm2, XMMWORD PTR _in           ; |x| < 0.1633123935319536975596774e+17 ?
        jp          x_nan
        jae         bigx
        comisd      xmm2, XMMWORD PTR _in1          ; |x| < 0.03125 ?
        jae         xge0_03125
        comisd      xmm2, XMMWORD PTR _in0          ; |x| < 2^(-27) ?
        jb          retx                            ; atan(x) ~= x

        ; 2^(-27) < |x| < 0.03125, atan(x)=x-x*P(x^2)

        movapd      xmm1, xmm2
        mulpd       xmm1, xmm2                      ; |x|^2
        movapd      xmm3, xmm1
        mulpd       xmm3, xmm1                      ; |x|^4
        movapd      xmm5, XMMWORD PTR _p01          ; calculate P(x^2)
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _p23
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _p45
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _p60
        mulsd       xmm5, xmm1
        movapd      xmm3, xmm5
        shufpd      xmm3, xmm3, 1
        addsd       xmm5, xmm3                      ; P(x^2)
        mulsd       xmm5, xmm7                      ; x * P(x^2)
        subsd       xmm7, xmm5                      ; x - x * P(x^2)
        movq        QWORD PTR [_x], xmm7
        fld         QWORD PTR [_x]
        ret

xge0_03125:                                         ; |x| >= 0.03125
        comisd    xmm2, XMMWORD PTR _in2            ; |x| < 0.375 ?
        jae       xge0_375

        ; 0.03125 < |x| < 0.375, atan(x)=x-x*D(x^2)

        movapd      xmm1, xmm2
        mulpd       xmm1, xmm2                      ; |x|^2
        movapd      xmm3, xmm1
        mulpd       xmm3, xmm1                      ; |x|^4
        movapd      xmm5, XMMWORD PTR _d01          ; calculate D(x^2)
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d23
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d45
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d67
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d89
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d1011
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d1213
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _d1400
        mulsd       xmm5, xmm1
        movapd      xmm3, xmm5
        shufpd      xmm3, xmm3, 1
        addsd       xmm5, xmm3                      ; D(x^2)
        mulsd       xmm5, xmm7                      ; x * D(x^2)
        subsd       xmm7, xmm5                      ; x - x * D(x^2)
        movq        QWORD PTR [_x], xmm7
        fld         QWORD PTR [_x]
        ret

xge0_375:                                           ; |x| >= 0.375
        movq        xmm6, xmm7                      ; x
        xorpd       xmm6, xmm2                      ; sign x
        comisd      xmm2, XMMWORD PTR _in3          ; |x| < 8.0 ?
        jae         xge8_0

;       0.375 < |x| < 8.0:
;       atan(|x|)=atan(s)+atan(t), s=ind*0.03125, t=(|x|-s)/(1+|x|*s)

        movq        xmm0, XMMWORD PTR _cnst8
        movq        xmm5, XMMWORD PTR _cntshf
        movq        xmm3, xmm2                      ; calculate ind
        addsd       xmm3, xmm0
        psrlq       xmm3, 44
        psubd       xmm3, xmm5
        movd        eax,  xmm3                      ; ind
        lea         eax,  DWORD PTR [eax+eax*2]     ; ind*3
        movq        xmm5, QWORD PTR _atn[eax*8+16]  ; s
        movq        xmm3, xmm2                      ; |x|
        subsd       xmm2, xmm5                      ; |x|-s
        mulsd       xmm3, xmm5                      ; |x|*s
        addsd       xmm3, XMMWORD PTR _one          ; 1+|x|*s
        divsd       xmm2, xmm3                      ; (|x|-s)/(1+|x|*s)
        unpcklpd    xmm2, xmm2
        jmp         clcpol

xge8_0:                                             ; |x| > 8.0

;       8.0 < |x| < 0.1633123935319536975596774e+17:
;       atan(|x|)=Pi/2+atan(-1/|x|)

        mov         eax, 768                        ; ind*3 - entry point in table, where lo and hi part of Pi/2
        movq        xmm0, xmm2                      ; |x|
        movq        xmm2, XMMWORD PTR _onen
        divsd       xmm2, xmm0                      ;-1/|x|
        unpcklpd    xmm2, xmm2

clcpol:
        movq        xmm0, QWORD PTR _atn[0+eax*8]   ; atn[ind+0] - hi part of atan(s) or Pi/2
        movq        xmm4, QWORD PTR _atn[8+eax*8]   ; atn[ind+1] - lo part of atan(s) or Pi/2
        movapd      xmm1, xmm2
        mulpd       xmm1, xmm2                      ; |x|^2
        movapd      xmm3, xmm1
        mulpd       xmm3, xmm1                      ; |x|^4
        movapd      xmm5, XMMWORD PTR _p01          ; calculate P(x^2)
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _p23
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _p45
        mulpd       xmm5, xmm3
        addpd       xmm5, XMMWORD PTR _p60
        mulsd       xmm5, xmm1
        movapd      xmm3, xmm5
        shufpd      xmm3, xmm3, 1
        addsd       xmm5, xmm3                      ; P(x^2)

;       atan(|x|) = atn[ind+0]-((|x|*P(x^2)-atn[ind+1])-|x|)

        mulsd       xmm5, xmm2                      ; |x|*P(x^2)
        subsd       xmm5, xmm4                      ; |x|*P(x^2)-atn[ind+1]
        subsd       xmm5, xmm2                      ; (|x|*P(x^2)-atn[ind+1])-|x|
        subsd       xmm0, xmm5                      ; atn[ind+0]-((|x|*P(x^2)-atn[ind+1])-|x|)
        orpd        xmm0, xmm6                      ; sign x
        movq        QWORD PTR [_x], xmm0
        fld         QWORD PTR [_x]
        ret

retx:                                               ; |x| < 2^(-27): atan(x) ~= x
        comisd      xmm2, XMMWORD PTR _zero         ; x == 0 ?
        jne         notzero
        fld         QWORD PTR [_x]                  ; x == +0.0 or -0.0
        ret

notzero:
        comisd      xmm2, XMMWORD PTR _minval       ; x < minval ?
        jae         ge_minval
        fld         QWORD PTR libm_small
        fmul        QWORD PTR libm_small
        sub         esp, 8
        fstp        QWORD PTR [esp]                 ; should be flag UNDERFLOW
        fld         QWORD PTR [esp]
        add         esp, 8
        fadd        QWORD PTR [_x]                  ; should be inexact result
        ret

ge_minval:                                          ; minval < x < 2^(-27)
        fld         QWORD PTR libm_small
        fmul        QWORD PTR libm_small
        fadd        QWORD PTR [_x]                  ; should be inexact result
        ret

bigx:                                               ; |x| > 0.1633123935319536975596774e+17
        movq        xmm0, xmm2                      ; |x|
        movq        xmm3, QWORD PTR _mexp
        andpd       xmm0, xmm3
        ucomisd     xmm0, xmm3
        jp          x_nan

        mov         eax, DWORD PTR [_x+4]           ; x
        shr         eax, 31                         ; sign x
        fld         QWORD PTR libm_small
        fadd        QWORD PTR _pi_2d[eax*8]         ; should be inexact result
        ret                                         ; return +-Pi/2

x_nan:
        mov         edx, 1003
        ;call libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag)
        sub         esp, 16
        mov         DWORD PTR [esp+12],edx
        mov         edx, esp
        add         edx, 16+4
        mov         DWORD PTR [esp+8],edx
        mov         DWORD PTR [esp+4],edx
        mov         DWORD PTR [esp],edx
        call        NEAR PTR __libm_error_support
        add         esp, 16

        fld         QWORD PTR [_x]
        ret                                         ; return same nan

        ALIGN       4

_CIatan_pentium4  ENDP

_TEXT   ENDS

        END