src/libm/Source/Intel/nearbyintl.S at fixPythonPipStalling

overby.me / darling-nix
fork
this repo has no description
fork
darling-nix / src / libm / Source / Intel / nearbyintl.S
at fixPythonPipStalling 169 lines 9.9 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
  1//  long double nearbyintl(long double x)
  2//
  3//  returns the argument rounded to an integral value using the prevailing
  4//  rounding mode, and without raising the inexact flag (C99 7.12.9.4).
  5//
  6//  Original version due to Ian Ollmann, 2007
  7//
  8//  Edits by Steve Canon January 2010:
  9//      rearrange branches to match nearbyint[f]
 10//      minimal tune-up to get sign of zero right
 11 
 12#include <machine/asm.h>
 13#include "abi.h"
 14
 15ENTRY( nearbyintl )
 16    SUBP    $(32-FRAME_SIZE),       STACKP
 17    
 18    //write the current rounding mode to the stack.  Using the mxcsr instead cost an extra 20 cycles.
 19    fnstcw  16(STACKP) 
 20
 21    fldz                                        // { 0 }
 22    movq    32(STACKP),             %xmm0       //Load the mantissa
 23    fldt    32(STACKP)                          // { x, 0 }
 24    pxor    %xmm1,                  %xmm1       // set our default truncation mask to 0
 25    xorl    %ecx,                   %ecx
 26    movw    40(STACKP),             %cx         //Load the signed exponent
 27    movl    %ecx,                   %edx        // save signed exponent
 28    movw    %cx,                    8(STACKP)   // write out exponent of truncated value
 29    andl    $0x8000,                %edx        // sign
 30    andl    $0x7fff,                %ecx        // biased exponent
 31    shll    $16,                    %edx        // sign << 16
 32    addl    $(16384-62),            %ecx        // add 16384-62 to exponent. This forces Inf, NaN, numbers >= 2**63 to be negative
 33    cmpw    $(16383+16384-62),      %cx         // values with exponents less than 16384-62 are fractional numbers only
 34    jl      L_xIsLargeOrSmall                   // branch for |x| < 1.0L, |x| >= 0x1.0p63L, NaN
 35
 36    //Now we know we have a number 1.0 <= |x| < 2**63
 37
 38    fstp    %st(1)                              // { x }
 39    //create a mask of non-fractional bits to calculate trunc(x)
 40    subl    $(16383+16384-62+63),   %ecx        // unbias the exponent, undo our magic above
 41    negl                            %ecx        // bits to shift -1ULL left to create a truncation mask
 42    pcmpeqb %xmm1,                  %xmm1       // -1ULL
 43    movd    %ecx,                   %xmm7       // bits to shift -1ULL left to create a truncation mask
 44    psllq   %xmm7,                  %xmm1       // left shift -1ULL by number of fractional bits in x
 45    
 46    //Round. We get here in one of two ways:
 47    //      fall through from above:    xmm2 has a truncation mask   
 48    //      jmp from |x| < 1.0:         xmm2 holds the sign of x (stored as +-0.0f), which rounds x to zero of the correct sign in this step
 49L_reentryForSmallX:
 50    pand    %xmm0,                  %xmm1       // trunc(mantissa)
 51    movq    %xmm1,                  (STACKP)    
 52    fldt    (STACKP)                            // { trunc(x), x }
 53    fucomi  %st(1), %st(0)                      // trunc(x) == x ?
 54    je      L_truncate                          // if the value was an integer, we are done, so return x (satisfies x==0, small integer cases )
 55
 56    //find our what our rounding mode is
 57    movl    16(STACKP),             %ecx
 58    andl    $0xC00,                 %ecx        // isolate the RC field
 59    subl    $0x400,                 %ecx        // move one of the possible values negative
 60    cmpw    $0x400,                 %cx         // test the RC value
 61    
 62    // Rounding modes in eax:
 63    //      -0x400          Round to nearest even
 64    //      0               Round to -Inf
 65    //      0x400           Round to +inf
 66    //      0x800           Round to zero
 67    
 68    jg      L_truncate                          //Round to zero mode.               Note: signed compare here
 69    je      L_roundUp                           //Round to +Inf, Go there
 70    jb      L_roundDown                         //Round to -Inf, Go there.          Note: unsigned compare here
 71    
 72    //Round to nearest even is the fall through path, because it is most common
 73    fsubr   %st(0), %st(1)                      // { trunc(x), fract }
 74
 75    orl     $0x3f000000,            %edx        // copysign( 0.5f, x )
 76    movl    %edx,                   16(STACKP)
 77    flds    16(STACKP)                          // { copysign( 0.5L, x ), trunc(x), fract }
 78    
 79    //check for fract == copysign( 0.5f, x )
 80
 81    fld     %st(2)                              // { fract, copysign( 0.5L, x ), trunc(x), fract }
 82    fabs                                        // { |fract|, copysign( 0.5L, x ), trunc(x), fract } 
 83    fld     %st(1)                              // { copysign( 0.5L, x ), |fract|, copysign( 0.5L, x ), trunc(x), fract }
 84    fabs                                        // { 0.5L, |fract|, copysign( 0.5L, x ), trunc(x), fract }
 85    fucomip %st(1), %st(0)                      // { |fract|, copysign( 0.5L, x ), trunc(x), fract }        
 86    fstp    %st(0)                              // { copysign( 0.5L, x ), trunc(x), fract } 
 87    je      L_exactHalfway                      // |fract| == 0.5, figure out which way to round in the comfort and safety of a separate code branch
 88
 89    fldz                                        // { 0.0L, copysign( 0.5L, x ), trunc(x), fract } 
 90    fcmovb  %st(1), %st(0)                      // { 0.5L < |fract| ? copysign( 0.5L, x ) : 0.0L, copysign( 0.5L, x ), trunc(x), fract }
 91    fstp    %st(1)                              // { 0.5L < |fract| ? copysign( 0.5L, x ) : 0.0L, trunc(x), fract }
 92    fadd    %st(0), %st(0)                      // { 0.5L < |fract| ? copysign( 1.0L, x ) : 0.0L, trunc(x), fract }
 93    faddp                                       // { correctly rounded x, fract }
 94    fstp    %st(1)
 95    fabs                                        // { |result| }
 96    cmp     $0,     %edx
 97    jge     1f
 98    fchs                                        // flip the sign if x is negative
 991:  ADDP    $(32-FRAME_SIZE),       STACKP
100    ret
101    
102L_exactHalfway:                                 // { copysign( 0.5L, x ), trunc(x), fract }
103    addl    $0x20000000,            %edx        // copysign( 0x1.0p63f, x )
104    movl    %edx,                   16(STACKP)  // copysign( 0x1.0p63f, x )
105    flds    16(STACKP)                          // { copysign( 0x1.0p63L, x ), copysign( 0.5L, x ), trunc(x), fract }
106    fadd    %st(2), %st(0)                      // { trunc(x) + copysign( 0x1.0p63L, x ), copysign( 0.5L, x ), trunc(x), fract }
107    fstpt   (STACKP)                            // { copysign( 0.5L, x ), trunc(x), fract }
108    fstp    %st(0)                              // { trunc(x), fract }
109    fxch                                        // { fract, trunc(x) }
110    fadd    %st(0), %st(0)                      // { copysign( 1.0L, x ), trunc(x) }
111    movl    $1,                     %eax        //
112    fldz                                        // { 0, copysign( 1.0L, x ), trunc(x) }
113    andl    (STACKP),               %eax        // (mantissa & 1ULL) == 0
114    fcmovne %st(1), %st(0)                      // { rounding value, copysign( 1.0L, x ), trunc(x) }
115    fstp    %st(1)                              // { rounding value, trunc(x) }
116    faddp                                       // { result }
117    fabs                                        // { |result| }
118    cmp     $0,     %edx
119    jge     1f
120    fchs                                        // flip the sign if x is negative
1211:  ADDP    $(32-FRAME_SIZE),       STACKP
122    ret
123
124L_truncate:
125    fstp    %st(1)
126    ADDP    $(32-FRAME_SIZE),       STACKP
127    ret                           
128    
129L_roundUp:                                      // { trunc(x), x }
130    fsubr   %st(0), %st(1)                      // { trunc(x), fract }
131    fldz                                        // { 0.0L, trunc(x), fract }
132    fucomi  %st(2), %st(0)                      // { 0.0L, trunc(x), fract }      0.0L < fract ?
133    fld1                                        // { 1.0L, 0.0L, trunc(x), fract }
134    fcmovnbe %st(1), %st(0)                     // { 0.0L or 1.0L, 0.0L, trunc(x), fract }
135    fstp    %st(3)                              // { 0.0L, trunc(x), 0.0L or 1.0L }
136    fstp    %st(0)                              // { trunc(x), 0.0L or 1.0L }
137    fadd    %st(1), %st(0)                      // { correctly rounded x, 0.0L or 1.0L }
138    fstp    %st(1)                              // { correctly rounded x }
139    fabs                                        // { |result| }
140    cmp     $0,     %edx
141    jge     1f
142    fchs                                        // flip the sign if x is negative
1431:  ADDP    $(32-FRAME_SIZE),       STACKP
144    ret
145
146L_roundDown:                                    // { trunc(x), x }
147    fsubr   %st(0), %st(1)                      // { trunc(x), fract }
148    fldz                                        // { 0.0L, trunc(x), fract }
149    fucomi  %st(2), %st(0)                      // { 0.0L, trunc(x), fract }      0.0L > fract ?
150    fld1                                        // { 1.0L, 0.0L, trunc(x), fract }
151    fcmovb  %st(1), %st(0)                      // { 0.0L or 1.0L, 0.0L, trunc(x), fract }
152    fstp    %st(3)                              // { 0.0L, trunc(x), 0.0L or 1.0L }
153    fstp    %st(0)                              // { trunc(x), 0.0L or 1.0L }
154    fsub    %st(1), %st(0)                      // { correctly rounded x, 0.0L or 1.0L }
155    fstp    %st(1)                              // { correctly rounded x }
156    fabs                                        // { |result| }
157    cmp     $0,     %edx
158    jge     1f
159    fchs                                        // flip the sign if x is negative
1601:  ADDP    $(32-FRAME_SIZE),       STACKP
161    ret
162
163L_xIsLargeOrSmall:
164    jae     L_truncate                          // NaN, |x| >= 2**63 we are done
165
166    // |x| < 1.0f skip to rounding
167    fstp    %st(1)                              // { x }
168    andw    $0x8000,    8(STACKP)               // zero exponent of truncated value, but preserve sign
169    jmp     L_reentryForSmallX
Configure Feed

Configure Feed