// nearbyint.s -- SSE implementation of nearbyint( ) for i386 and x86_64 (Apple Libm)
// double nearbyint(double x)
//
// returns the argument rounded to an integral value using the prevailing
// rounding mode, and without raising the inexact flag (C99 7.12.9.3).
//
// -- Stephen Canon, January 2010
7
8#include <System/i386/cpu_capabilities.h>
9
.text
.align 4
.globl _nearbyint
_nearbyint:

// Fast path if SSE4.1 is available ------------------------------------------
//
// ROUNDSD immediate 0xc = bit 2 (RS: use the current MXCSR rounding mode) +
// bit 3 (suppress the precision/inexact exception), which is exactly the
// contract of nearbyint.

#if defined __i386__
    movsd   4(%esp),    %xmm0       // load the double argument from the stack
    mov     _COMM_PAGE_CPU_CAPABILITIES, %eax // CPU feature bits published in the commpage
    test    $(kHasSSE4_1), %eax
    jz      L_noSSE4_1              // no ROUNDSD -> take the SSE2 fallback below
    roundsd $0xc,   %xmm0,  %xmm0   // round per current mode, inexact suppressed
    movsd   %xmm0,  4(%esp)         // i386 ABI returns doubles on the x87 stack,
    fldl    4(%esp)                 // so bounce the result through memory
    ret
#elif defined __x86_64__
    movabs  _COMM_PAGE_CPU_CAPABILITIES, %eax // commpage lives at a fixed 64-bit address
    test    $(kHasSSE4_1), %eax
    jz      L_noSSE4_1              // no ROUNDSD -> take the SSE2 fallback below
    roundsd $0xc,   %xmm0,  %xmm0   // round per current mode, inexact suppressed
    ret
#else
#error "This implementations supports i386 and x86_64 only"
#endif

// Fallback path if SSE4.1 is not available ----------------------------------
//
// The rounding mode is read from the x87 FP control word (fetched with
// fnstcw below), rounding control bits 11:10:
//
// 00 -- Round to nearest even
// 01 -- Round to -Inf
// 10 -- Round to +Inf
// 11 -- Round to zero
//
// This path is adapted from Ian Ollmann's original implementation

L_noSSE4_1:
#if defined __i386__
    mov     8(%esp),    %ecx        // high word of input
    sub     $4, %esp                // make space for the fp control word
    #define fpcw (%esp)
    #define floatval 8(%esp)        // address of argument to nearbyint
#else
    movd    %xmm0,  %rcx
    shr     $32,    %rcx            // high word of input
    #define fpcw -4(%rsp)           // space for fp control word in red zone
#endif
    fnstcw  fpcw                    // capture the current x87 rounding mode

    mov     %ecx,   %edx
    pcmpeqb %xmm1,  %xmm1           // all ones
    and     $0x7fffffff, %ecx       // |x|
    psllq   $63,    %xmm1           // -0.0 (sign-bit mask)
    xor     %ecx,   %edx            // signbit of x
    movapd  %xmm1,  %xmm2
    sub     $0x3ff00000, %ecx       // high word of |x| - exponent bias
    andpd   %xmm0,  %xmm2           // signbit of x
    cmp     $0x03400000, %ecx       // if |x| < 1.0 or |x| >= 0x1.0p52 or isnan(x)
    jae     L_edgeCases             // branch (unsigned: catches negative ecx too)

    shr     $20,    %ecx            // exponent of x, in [0, 51]
    sub     $52,    %ecx
    neg     %ecx                    // number of fractional bits in x
    pcmpeqb %xmm1,  %xmm1
    movd    %ecx,   %xmm7           // keep the shift count; reused in L_exactHalfway
    psllq   %xmm7,  %xmm1           // mask covering integral bits of x

L_reentryForSmallX:                 // from L_edgeCases: xmm1 = sign bit only, xmm7 = 0
    andpd   %xmm0,  %xmm1           // trunc(x)
    ucomisd %xmm1,  %xmm0           // if x == trunc(x)
    je      L_roundToZero           // return trunc(x)

    mov     fpcw,   %ecx
    and     $0xc00, %ecx            // current rounding mode encoding
    sub     $0x400, %ecx            // bias so the four modes separate below:
    cmp     $0x400, %ecx            // nearest -> 0xfffffc00 falls through (jb is unsigned)
    jg      L_roundToZero           // ecx = 0x800
    je      L_roundUp               // ecx = 0x400
    jb      L_roundDown             // ecx = 0x000

L_roundToNearest:                   // fall into default rounding path
    subsd   %xmm1,  %xmm0           // fractional part of x
    pcmpeqb %xmm3,  %xmm3
    psllq   $55,    %xmm3
    psrlq   $2,     %xmm3           // 0.5
    xorpd   %xmm2,  %xmm0           // |fract(x)|
    ucomisd %xmm3,  %xmm0           // if |fractional part| == 0.5
    je      L_exactHalfway          // branch to handle exact halfway case

    cmpltsd %xmm3,  %xmm0           // |fract(x)| < 0.5 ? -1 : 0
    addsd   %xmm3,  %xmm3           // 1.0
    orpd    %xmm2,  %xmm3           // copysign(1.0,x)
    andnpd  %xmm3,  %xmm0           // |fract(x)| < 0.5 ? 0.0 : copysign(1.0,x)
    addsd   %xmm1,  %xmm0           // correctly rounded result
    orpd    %xmm2,  %xmm0           // restore sign of zero
#if defined( __i386__ )
    movsd   %xmm0,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#endif
    ret

L_exactHalfway:                     // exact .5 fraction: round to even
    addsd   %xmm3,  %xmm3           // 1.0
    pcmpeqb %xmm4,  %xmm4           // all ones (-1 per lane)
    psubq   %xmm4,  %xmm7           // shift count + 1 (subtracting -1)
    psllq   %xmm7,  %xmm4           // mask covering signbit and bits larger than 2^1
    orpd    %xmm2,  %xmm3           // copysign(1.0, x)
    pandn   %xmm1,  %xmm4           // trunc(x) odd ? nonzero : zero
    xorpd   %xmm5,  %xmm5           // 0, for the dword-wise zero test below
    pcmpeqd %xmm5,  %xmm4           // trunc(x) odd ? at least one dword is 0 : -1
    pshufd  $0xe1,  %xmm4,  %xmm5   // swap the two dwords of the low qword
    pand    %xmm5,  %xmm4           // trunc(x) odd ? 0 : -1 (both dwords must be zero)
    andnpd  %xmm3,  %xmm4           // trunc(x) odd ? copysign(1.0,x), 0
    addsd   %xmm4,  %xmm1           // step away from zero only if trunc(x) is odd
    orpd    %xmm2,  %xmm1           // reapply sign of zero

L_roundToZero:                      // also the common "x already integral" exit
#if defined( __i386__ )
    movsd   %xmm1,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#else
    movapd  %xmm1,  %xmm0
#endif
    ret

L_roundUp:                          // we already know that x is not integral
    xorpd   %xmm3,  %xmm3
    cmpltsd %xmm0,  %xmm3           // x > 0 ? -1 : 0
    cvtdq2pd %xmm3, %xmm3           // x > 0 ? -1.0 : 0.0
    subsd   %xmm3,  %xmm1           // x > 0 ? trunc(x) + 1.0 : trunc(x)
#if defined( __i386__ )
    movsd   %xmm1,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#else
    movapd  %xmm1,  %xmm0
#endif
    ret

L_roundDown:                        // we already know that x is not integral
    psrlq   $63,    %xmm0           // x < 0 ? 1 : 0
    cvtdq2pd %xmm0, %xmm3           // x < 0 ? 1.0 : 0.0
    subsd   %xmm3,  %xmm1           // x < 0 ? trunc(x) - 1.0 : trunc(x)
    pcmpeqb %xmm0,  %xmm0
    psrlq   $1,     %xmm0           // 0x7fffffffffffffff
    orpd    %xmm2,  %xmm0           // x < 0 ? 0xffffffffffffffff : 0x7fffffffffffffff
    andpd   %xmm1,  %xmm0           // clear the sign for x > 0 (round-down never yields -0 there)
#if defined( __i386__ )
    movsd   %xmm0,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#endif
    ret

L_edgeCases:                        // if |x| < 1.0, jump back to mainline
    xorpd   %xmm7,  %xmm7           // initialize data in xmm7 (shift count 0: xmm1
                                    // keeps only the sign bit, so trunc(x) = +-0.0)
    jl      L_reentryForSmallX      // otherwise (|x| >= 2^52 or NaN), x is already
                                    // integral: we just return x
#if defined( __i386__ )
    movsd   %xmm0,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#endif
    ret
176
177/*
1787: //Round to even nearest, half way case
179 addsd %xmm6, %xmm6 // 1.0
180 pcmpeqb %xmm3, %xmm3 // -1ULL
181 psubq %xmm3, %xmm7 // add one to the truncation mask shift constant
182 psllq %xmm7, %xmm3 // prepare a new truncation mask with left edge past unit bit. (Works for 1.0 too, since least significant bit of exponent has odd parity)
183 orpd %xmm2, %xmm6 // copysign( 1.0, x )
184 pandn %xmm1, %xmm3 // if trunc(x) is odd, this bit will be non-zero (in the +-0.5, case we have +-0.5 here instead of just a bit set )
185 xorpd %xmm4, %xmm4 // 0.0
	pcmpeqd	%xmm4, %xmm3			// check if each 32-bit chunk is equal to 0. Unfortunately there is no 64-bit integer compare. We'd hit denorms here if we used a double-precision compare.
187 pshufd $0xE1, %xmm3, %xmm4 // swap both chunks
188 pand %xmm4, %xmm3 // trunc(x) is odd ? 0 : -1U (make sure the other chunk is also equal to 0)
189 andnpd %xmm6, %xmm3 // trunc(x) is odd ? copysign( 1.0, x ) : 0.0f
190 addsd %xmm3, %xmm1 // round
191
192#if defined( __i386__ )
193 movsd %xmm1, (STACKP)
194 fldl (STACKP)
195#else
196 movsd %xmm1, %xmm0
197#endif
198 ADDP $(16-FRAME_SIZE), STACKP
199 ret
200
201 */
202