this repo has no description
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at fixPythonPipStalling 160 lines 6.0 kB view raw
1// float nearbyintf(float x) 2// 3// returns the argument rounded to an integral value using the prevailing 4// rounding mode, and without raising the inexact flag (C99 7.12.9.4). 5// 6// -- Stephen Canon, January 2010 7 8#include <System/i386/cpu_capabilities.h> 9 10.text 11.align 4 12.globl _nearbyintf 13_nearbyintf: 14 15// Fast path if SSE4.1 is available ------------------------------------------ 16 17#if defined __i386__ 18 movss 4(%esp), %xmm0 19 mov _COMM_PAGE_CPU_CAPABILITIES, %eax 20 test $(kHasSSE4_1), %eax 21 jz L_noSSE4_1 22 roundss $0xc, %xmm0, %xmm0 23 movss %xmm0, 4(%esp) 24 flds 4(%esp) 25 ret 26#elif defined __x86_64__ 27 movabs _COMM_PAGE_CPU_CAPABILITIES, %eax 28 test $(kHasSSE4_1), %eax 29 jz L_noSSE4_1 30 roundss $0xc, %xmm0, %xmm0 31 ret 32#else 33#error "This implementations supports i386 and x86_64 only" 34#endif 35 36// Fallback path if SSE4.1 is not available ---------------------------------- 37// 38// FPSCR rounding control bits 11:10 39// 40// 00 -- Round to nearest even 41// 01 -- Round to -Inf 42// 10 -- Round to +Inf 43// 11 -- Round to zero 44// 45// This path is adapted from Ian Ollmann's original implementation 46 47L_noSSE4_1: 48 movd %xmm0, %ecx 49#if defined __i386__ 50 sub $4, %esp // make space for the fp control word 51 #define fpcw (%esp) 52 #define floatval 8(%esp) // address of argument to nearbyintf 53#else 54 #define fpcw -4(%rsp) // space for fp control word in red zone 55#endif 56 fnstcw fpcw 57 58// Detect special cases and branch 59 mov %ecx, %edx 60 and $0x7fffffff, %ecx // |x| 61 xor %ecx, %edx // signbit of x 62 movd %edx, %xmm2 63 sub $0x3f800000, %ecx // |x| - exponent bias 64 mov $0x80000000, %eax 65 cmp $0x0b800000, %ecx // if |x| < 1.0 or |x| >= 0x1.0p23f or isnan(x) 66 jae L_edgeCases // branch 67 68 shr $23, %ecx // exponent of x, in [0,22] 69 add $8, %ecx 70 sar %cl, %eax // mask covering integral bits of x 71 72L_reentryForSmallX: 73 movd %eax, %xmm1 74 andps %xmm0, %xmm1 // trunc(x) 75 ucomiss %xmm1, %xmm0 // if x == trunc(x) 76 je L_roundToZero // return trunc(x) 77 78 mov fpcw, %ecx 79 and $0xc00, %ecx // current rounding mode encoding 80 sub $0x400, %ecx 81 cmp $0x400, %ecx 82 jg L_roundToZero // ecx = 0x800 83 je L_roundUp // ecx = 0x400 84 jb L_roundDown // ecx = 0x000 85 86L_roundToNearest: // fall into default rounding path 87 subss %xmm1, %xmm0 // fractional part of x 88 movd %xmm0, %ecx 89 or $0x3f000000, %edx // copysignf(0.5f,x) 90 xor %edx, %ecx // frac(x) ^ copysignf(0.5f,x) 91 jz L_exactHalfway // handle exact halfway cases separately 92 93 sub $0x00800000, %ecx // |frac(x)| > 0.5 ? negative : positive 94 or $0x00800000, %edx 95 sar $31, %ecx // |frac(x)| > 0.5 ? -1 : 0 96 and %ecx, %edx // |frac(x)| > 0.5 ? copysignf(1.0f,x) : 0 97 movd %edx, %xmm0 98 addss %xmm1, %xmm0 // |frac(x)| > 0.5 ? trunc(x) + copysign(1.0f,x) : trunc(x) 99 orps %xmm2, %xmm0 // apply signbit to get the right zero 100#if defined( __i386__ ) 101 movss %xmm0, floatval 102 flds floatval 103 add $4, %esp 104#endif 105 ret 106 107L_exactHalfway: 108 shl $1, %eax 109 addss %xmm0, %xmm0 // copysignf(1.0f,x) 110 or $0x80000000, %eax // mask covering signbit and bits larger than 2^1 111 movd %eax, %xmm3 112 pandn %xmm1, %xmm3 // trunc(x) odd ? non-zero : zero 113 pxor %xmm4, %xmm4 114 pcmpeqd %xmm4, %xmm3 // trunc(x) odd ? 0 : -1 115 andnps %xmm0, %xmm3 // trunc(x) odd ? copysignf(1.0f,x) : 0.0f 116 addss %xmm3, %xmm1 117 orps %xmm2, %xmm1 // apply signbit to result 118L_roundToZero: 119#if defined( __i386__ ) 120 movss %xmm1, floatval 121 flds floatval 122 add $4, %esp 123#else 124 movaps %xmm1, %xmm0 125#endif 126 ret 127 128L_roundUp: // we know that x is not integral 129 psrad $31, %xmm0 // x < 0 ? -1 : 0 130 pcmpeqb %xmm3, %xmm3 // -1 131 psubd %xmm3, %xmm0 // x < 0 ? 0 : 1 132 cvtdq2ps %xmm0, %xmm0 // x < 0 ? 0.0 : 1.0 133 addss %xmm1, %xmm0 // x < 0 ? trunc(x) : trunc(x) + 1 134 orps %xmm2, %xmm0 // apply signbit to get the right zero 135#if defined( __i386__ ) 136 movss %xmm0, floatval 137 flds floatval 138 add $4, %esp 139#endif 140 ret 141 142L_roundDown: // we know that x is not integral 143 psrad $31, %xmm0 // x < 0 ? -1 : 0 144 cvtdq2ps %xmm0, %xmm0 // x < 0 ? -1 : 0 145 addss %xmm1, %xmm0 // x < 0 ? trunc(x) - 1 : trunc(x) 146#if defined( __i386__ ) 147 movss %xmm0, floatval 148 flds floatval 149 add $4, %esp 150#endif 151 ret 152 153L_edgeCases: // if |x| < 1.0, jump back to mainline 154 jl L_reentryForSmallX // otherwise, we just return x 155#if defined( __i386__ ) 156 movss %xmm0, floatval 157 flds floatval 158 add $4, %esp 159#endif 160 ret