src/libm/Source/Intel/nearbyintf.S at fixPythonPipStalling

overby.me / darling-nix
fork
this repo has no description
fork
darling-nix / src / libm / Source / Intel / nearbyintf.S
at fixPythonPipStalling 160 lines 6.0 kB view raw
wrap content
Lubos Dolezel More progress 9y ago
a76db625
  1//  float nearbyintf(float x)
  2//
  3//  returns the argument rounded to an integral value using the prevailing
  4//  rounding mode, and without raising the inexact flag (C99 7.12.9.4).
  5//
  6//  -- Stephen Canon, January 2010
  7
  8#include <System/i386/cpu_capabilities.h>
  9
 10.text
 11.align 4
 12.globl _nearbyintf
 13_nearbyintf:
 14
 15// Fast path if SSE4.1 is available ------------------------------------------
 16
 17#if defined __i386__
 18    movss      4(%esp),         %xmm0
 19    mov         _COMM_PAGE_CPU_CAPABILITIES,    %eax
 20    test        $(kHasSSE4_1),  %eax
 21    jz          L_noSSE4_1
 22    roundss     $0xc,  %xmm0,   %xmm0
 23    movss       %xmm0,         4(%esp)
 24    flds       4(%esp)
 25    ret
 26#elif defined __x86_64__
 27    movabs      _COMM_PAGE_CPU_CAPABILITIES,    %eax
 28    test        $(kHasSSE4_1),  %eax
 29    jz          L_noSSE4_1
 30    roundss     $0xc,  %xmm0,   %xmm0
 31    ret
 32#else
 33#error "This implementations supports i386 and x86_64 only"
 34#endif
 35
 36// Fallback path if SSE4.1 is not available ----------------------------------
 37//
 38// FPSCR rounding control bits 11:10
 39//
 40//  00  --  Round to nearest even
 41//  01  --  Round to -Inf
 42//  10  --  Round to +Inf
 43//  11  --  Round to zero
 44//  
 45// This path is adapted from Ian Ollmann's original implementation
 46
 47L_noSSE4_1:
 48    movd        %xmm0,          %ecx
 49#if defined __i386__
 50    sub         $4,             %esp    // make space for the fp control word
 51    #define     fpcw            (%esp)
 52    #define     floatval       8(%esp)  // address of argument to nearbyintf
 53#else
 54    #define     fpcw          -4(%rsp)  // space for fp control word in red zone
 55#endif
 56    fnstcw      fpcw
 57      
 58// Detect special cases and branch
 59    mov         %ecx,           %edx
 60    and         $0x7fffffff,    %ecx    // |x|
 61    xor         %ecx,           %edx    // signbit of x
 62    movd        %edx,           %xmm2
 63    sub         $0x3f800000,    %ecx    // |x| - exponent bias
 64    mov         $0x80000000,    %eax
 65    cmp         $0x0b800000,    %ecx    // if |x| < 1.0 or |x| >= 0x1.0p23f or isnan(x)
 66    jae         L_edgeCases             //    branch
 67    
 68    shr         $23,            %ecx    // exponent of x, in [0,22]
 69    add         $8,             %ecx
 70    sar         %cl,            %eax    // mask covering integral bits of x
 71    
 72L_reentryForSmallX:
 73    movd        %eax,           %xmm1
 74    andps       %xmm0,          %xmm1   // trunc(x)
 75    ucomiss     %xmm1,          %xmm0   // if x == trunc(x)
 76    je          L_roundToZero           //    return trunc(x)
 77
 78    mov         fpcw,           %ecx
 79    and         $0xc00,         %ecx    // current rounding mode encoding
 80    sub         $0x400,         %ecx
 81    cmp         $0x400,         %ecx
 82    jg          L_roundToZero           // ecx = 0x800
 83    je          L_roundUp               // ecx = 0x400
 84    jb          L_roundDown             // ecx = 0x000
 85
 86L_roundToNearest:                       // fall into default rounding path
 87    subss       %xmm1,          %xmm0   // fractional part of x
 88    movd        %xmm0,          %ecx
 89    or          $0x3f000000,    %edx    // copysignf(0.5f,x)
 90    xor         %edx,           %ecx    // frac(x) ^ copysignf(0.5f,x)
 91    jz          L_exactHalfway          // handle exact halfway cases separately
 92    
 93    sub         $0x00800000,    %ecx    // |frac(x)| > 0.5 ? negative : positive
 94    or          $0x00800000,    %edx
 95    sar         $31,            %ecx    // |frac(x)| > 0.5 ? -1 : 0
 96    and         %ecx,           %edx    // |frac(x)| > 0.5 ? copysignf(1.0f,x) : 0
 97    movd        %edx,           %xmm0
 98    addss       %xmm1,          %xmm0   // |frac(x)| > 0.5 ? trunc(x) + copysign(1.0f,x) : trunc(x)
 99    orps        %xmm2,          %xmm0   // apply signbit to get the right zero
100#if defined( __i386__ )
101    movss       %xmm0,          floatval
102    flds        floatval
103    add         $4,             %esp
104#endif
105    ret
106
107L_exactHalfway:
108    shl         $1,             %eax
109    addss       %xmm0,          %xmm0   // copysignf(1.0f,x)
110    or          $0x80000000,    %eax    // mask covering signbit and bits larger than 2^1
111    movd        %eax,           %xmm3
112    pandn       %xmm1,          %xmm3   // trunc(x) odd ? non-zero : zero
113    pxor        %xmm4,          %xmm4
114    pcmpeqd     %xmm4,          %xmm3   // trunc(x) odd ? 0 : -1
115    andnps      %xmm0,          %xmm3   // trunc(x) odd ? copysignf(1.0f,x) : 0.0f
116    addss       %xmm3,          %xmm1
117    orps        %xmm2,          %xmm1   // apply signbit to result
118L_roundToZero:
119#if defined( __i386__ )
120    movss       %xmm1,          floatval
121    flds        floatval
122    add         $4,             %esp
123#else
124    movaps      %xmm1,          %xmm0
125#endif
126    ret
127    
128L_roundUp:                              // we know that x is not integral
129    psrad       $31,            %xmm0   // x < 0 ? -1 : 0
130    pcmpeqb     %xmm3,          %xmm3   // -1
131    psubd       %xmm3,          %xmm0   // x < 0 ? 0 : 1
132    cvtdq2ps    %xmm0,          %xmm0   // x < 0 ? 0.0 : 1.0
133    addss       %xmm1,          %xmm0   // x < 0 ? trunc(x) : trunc(x) + 1
134    orps        %xmm2,          %xmm0   // apply signbit to get the right zero
135#if defined( __i386__ )
136    movss       %xmm0,          floatval
137    flds        floatval
138    add         $4,             %esp
139#endif
140    ret
141    
142L_roundDown:                            // we know that x is not integral
143    psrad       $31,            %xmm0   // x < 0 ? -1 : 0
144    cvtdq2ps    %xmm0,          %xmm0   // x < 0 ? -1 : 0
145    addss       %xmm1,          %xmm0   // x < 0 ? trunc(x) - 1 : trunc(x)
146#if defined( __i386__ )
147    movss       %xmm0,          floatval
148    flds        floatval
149    add         $4,             %esp
150#endif
151    ret
152
153L_edgeCases:                            // if |x| < 1.0, jump back to mainline
154    jl          L_reentryForSmallX      // otherwise, we just return x
155#if defined( __i386__ )
156    movss       %xmm0,          floatval
157    flds        floatval
158    add         $4,             %esp
159#endif
160    ret
Configure Feed

Configure Feed