this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at fixPythonPipStalling 202 lines 8.2 kB view raw
1// float nearbyintf(float x) 2// 3// returns the argument rounded to an integral value using the prevailing 4// rounding mode, and without raising the inexact flag (C99 7.12.9.4). 5// 6// -- Stephen Canon, January 2010 7 8#include <System/i386/cpu_capabilities.h> 9 10.text 11.align 4 12.globl _nearbyint 13_nearbyint: 14 15// Fast path if SSE4.1 is available ------------------------------------------ 16 17#if defined __i386__ 18 movsd 4(%esp), %xmm0 19 mov _COMM_PAGE_CPU_CAPABILITIES, %eax 20 test $(kHasSSE4_1), %eax 21 jz L_noSSE4_1 22 roundsd $0xc, %xmm0, %xmm0 23 movsd %xmm0, 4(%esp) 24 fldl 4(%esp) 25 ret 26#elif defined __x86_64__ 27 movabs _COMM_PAGE_CPU_CAPABILITIES, %eax 28 test $(kHasSSE4_1), %eax 29 jz L_noSSE4_1 30 roundsd $0xc, %xmm0, %xmm0 31 ret 32#else 33#error "This implementations supports i386 and x86_64 only" 34#endif 35 36// Fallback path if SSE4.1 is not available ---------------------------------- 37// 38// FPSCR rounding control bits 11:10 39// 40// 00 -- Round to nearest even 41// 01 -- Round to -Inf 42// 10 -- Round to +Inf 43// 11 -- Round to zero 44// 45// This path is adapted from Ian Ollmann's original implementation 46 47L_noSSE4_1: 48#if defined __i386__ 49 mov 8(%esp), %ecx // high word of input 50 sub $4, %esp // make space for the fp control word 51 #define fpcw (%esp) 52 #define floatval 8(%esp) // address of argument to nearbyintf 53#else 54 movd %xmm0, %rcx 55 shr $32, %rcx // high word of input 56 #define fpcw -4(%rsp) // space for fp control word in red zone 57#endif 58 fnstcw fpcw 59 60 mov %ecx, %edx 61 pcmpeqb %xmm1, %xmm1 62 and $0x7fffffff, %ecx // |x| 63 psllq $63, %xmm1 // -0.0 64 xor %ecx, %edx // signbit of x 65 movapd %xmm1, %xmm2 66 sub $0x3ff00000, %ecx // high word of |x| - exponent bias 67 andpd %xmm0, %xmm2 // signbit of x 68 cmp $0x03400000, %ecx // if |x| < 1.0 or |x| >= 0x1.0p52 or isnan(x) 69 jae L_edgeCases // branch 70 71 shr $20, %ecx // exponent of x, in [0, 51] 72 sub $52, %ecx 73 neg %ecx // number of fractional bits in x 74 pcmpeqb %xmm1, %xmm1 75 movd %ecx, %xmm7 76 psllq %xmm7, %xmm1 // mask covering integral bits of x 77 78L_reentryForSmallX: 79 andpd %xmm0, %xmm1 // trunc(x) 80 ucomisd %xmm1, %xmm0 // if x == trunc(x) 81 je L_roundToZero // return trunc(x) 82 83 mov fpcw, %ecx 84 and $0xc00, %ecx // current rounding mode encoding 85 sub $0x400, %ecx 86 cmp $0x400, %ecx 87 jg L_roundToZero // ecx = 0x800 88 je L_roundUp // ecx = 0x400 89 jb L_roundDown // ecx = 0x000 90 91L_roundToNearest: // fall into default rounding path 92 subsd %xmm1, %xmm0 // fractional part of x 93 pcmpeqb %xmm3, %xmm3 94 psllq $55, %xmm3 95 psrlq $2, %xmm3 // 0.5 96 xorpd %xmm2, %xmm0 // |fract(x)| 97 ucomisd %xmm3, %xmm0 // if |fractional part| == 0.5 98 je L_exactHalfway // branch to handle exact halfway case 99 100 cmpltsd %xmm3, %xmm0 // |fract(x)| < 0.5 ? -1 : 0 101 addsd %xmm3, %xmm3 // 1.0 102 orpd %xmm2, %xmm3 // copysign(1.0,x) 103 andnpd %xmm3, %xmm0 // |fract(x)| < 0.5 ? 0.0 : copysign(1.0,x) 104 addsd %xmm1, %xmm0 // correctly rounded result 105 orpd %xmm2, %xmm0 // restore sign of zero 106#if defined( __i386__ ) 107 movsd %xmm0, floatval 108 fldl floatval 109 add $4, %esp 110#endif 111 ret 112 113L_exactHalfway: 114 addsd %xmm3, %xmm3 // 1.0 115 pcmpeqb %xmm4, %xmm4 116 psubq %xmm4, %xmm7 117 psllq %xmm7, %xmm4 // mask covering signbit and bits larger than 2^1 118 orpd %xmm2, %xmm3 // copysign(1.0, x) 119 pandn %xmm1, %xmm4 // trunc(x) odd ? nonzero : zero 120 xorpd %xmm5, %xmm5 121 pcmpeqd %xmm5, %xmm4 // trunc(x) odd ? at least one dword is 0 : -1 122 pshufd $0xe1, %xmm4, %xmm5 123 pand %xmm5, %xmm4 // trunc(x) odd ? 0 : -1 124 andnpd %xmm3, %xmm4 // trunc(x) odd ? copysign(1.0,x), 0 125 addsd %xmm4, %xmm1 126 orpd %xmm2, %xmm1 // reapply sign of zero 127 128L_roundToZero: 129#if defined( __i386__ ) 130 movsd %xmm1, floatval 131 fldl floatval 132 add $4, %esp 133#else 134 movapd %xmm1, %xmm0 135#endif 136 ret 137 138L_roundUp: // we already know that x is not integral 139 xorpd %xmm3, %xmm3 140 cmpltsd %xmm0, %xmm3 // x > 0 ? -1 : 0 141 cvtdq2pd %xmm3, %xmm3 // x > 0 ? -1.0 : 0.0 142 subsd %xmm3, %xmm1 // x > 0 ? trunc(x) + 1.0 : trunc(x) 143#if defined( __i386__ ) 144 movsd %xmm1, floatval 145 fldl floatval 146 add $4, %esp 147#else 148 movapd %xmm1, %xmm0 149#endif 150 ret 151 152L_roundDown: // we already know that x is not integral 153 psrlq $63, %xmm0 // x < 0 ? 1 : 0 154 cvtdq2pd %xmm0, %xmm3 // x < 0 ? 1.0 : 0.0 155 subsd %xmm3, %xmm1 // x < 0 ? trunc(x) - 1.0 : trunc(x) 156 pcmpeqb %xmm0, %xmm0 157 psrlq $1, %xmm0 // 0x7fffffffffffffff 158 orpd %xmm2, %xmm0 // x < 0 ? 0xffffffffffffffff : 0x7fffffffffffffff 159 andpd %xmm1, %xmm0 160#if defined( __i386__ ) 161 movsd %xmm0, floatval 162 fldl floatval 163 add $4, %esp 164#endif 165 ret 166 167L_edgeCases: // if |x| < 1.0, jump back to mainline 168 xorpd %xmm7, %xmm7 // initialize data in xmm7 169 jl L_reentryForSmallX // otherwise, we just return x 170#if defined( __i386__ ) 171 movsd %xmm0, floatval 172 fldl floatval 173 add $4, %esp 174#endif 175 ret 176 177/* 1787: //Round to even nearest, half way case 179 addsd %xmm6, %xmm6 // 1.0 180 pcmpeqb %xmm3, %xmm3 // -1ULL 181 psubq %xmm3, %xmm7 // add one to the truncation mask shift constant 182 psllq %xmm7, %xmm3 // prepare a new truncation mask with left edge past unit bit. (Works for 1.0 too, since least significant bit of exponent has odd parity) 183 orpd %xmm2, %xmm6 // copysign( 1.0, x ) 184 pandn %xmm1, %xmm3 // if trunc(x) is odd, this bit will be non-zero (in the +-0.5, case we have +-0.5 here instead of just a bit set ) 185 xorpd %xmm4, %xmm4 // 0.0 186 pcmpeqd %xmm4, %xmm3 // check if each 32-bit chunk is equal to 0. Unforunately there is no 64-bit integer compare. Wed hit denorms here if we use double precision compare. 187 pshufd $0xE1, %xmm3, %xmm4 // swap both chunks 188 pand %xmm4, %xmm3 // trunc(x) is odd ? 0 : -1U (make sure the other chunk is also equal to 0) 189 andnpd %xmm6, %xmm3 // trunc(x) is odd ? copysign( 1.0, x ) : 0.0f 190 addsd %xmm3, %xmm1 // round 191 192#if defined( __i386__ ) 193 movsd %xmm1, (STACKP) 194 fldl (STACKP) 195#else 196 movsd %xmm1, %xmm0 197#endif 198 ADDP $(16-FRAME_SIZE), STACKP 199 ret 200 201 */ 202