this repo has no description
1// float nearbyintf(float x)
2//
3// returns the argument rounded to an integral value using the prevailing
4// rounding mode, and without raising the inexact flag (C99 7.12.9.4).
5//
6// -- Stephen Canon, January 2010
7
8#include <System/i386/cpu_capabilities.h>
9
10.text
11.align 4
12.globl _nearbyintf
13_nearbyintf:
14
15// Fast path if SSE4.1 is available ------------------------------------------
16
17#if defined __i386__
18 movss 4(%esp), %xmm0
19 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
20 test $(kHasSSE4_1), %eax
21 jz L_noSSE4_1
22 roundss $0xc, %xmm0, %xmm0
23 movss %xmm0, 4(%esp)
24 flds 4(%esp)
25 ret
26#elif defined __x86_64__
27 movabs _COMM_PAGE_CPU_CAPABILITIES, %eax
28 test $(kHasSSE4_1), %eax
29 jz L_noSSE4_1
30 roundss $0xc, %xmm0, %xmm0
31 ret
32#else
33#error "This implementations supports i386 and x86_64 only"
34#endif
35
36// Fallback path if SSE4.1 is not available ----------------------------------
37//
38// FPSCR rounding control bits 11:10
39//
40// 00 -- Round to nearest even
41// 01 -- Round to -Inf
42// 10 -- Round to +Inf
43// 11 -- Round to zero
44//
45// This path is adapted from Ian Ollmann's original implementation
46
47L_noSSE4_1:
48 movd %xmm0, %ecx
49#if defined __i386__
50 sub $4, %esp // make space for the fp control word
51 #define fpcw (%esp)
52 #define floatval 8(%esp) // address of argument to nearbyintf
53#else
54 #define fpcw -4(%rsp) // space for fp control word in red zone
55#endif
56 fnstcw fpcw
57
58// Detect special cases and branch
59 mov %ecx, %edx
60 and $0x7fffffff, %ecx // |x|
61 xor %ecx, %edx // signbit of x
62 movd %edx, %xmm2
63 sub $0x3f800000, %ecx // |x| - exponent bias
64 mov $0x80000000, %eax
65 cmp $0x0b800000, %ecx // if |x| < 1.0 or |x| >= 0x1.0p23f or isnan(x)
66 jae L_edgeCases // branch
67
68 shr $23, %ecx // exponent of x, in [0,22]
69 add $8, %ecx
70 sar %cl, %eax // mask covering integral bits of x
71
72L_reentryForSmallX:
73 movd %eax, %xmm1
74 andps %xmm0, %xmm1 // trunc(x)
75 ucomiss %xmm1, %xmm0 // if x == trunc(x)
76 je L_roundToZero // return trunc(x)
77
78 mov fpcw, %ecx
79 and $0xc00, %ecx // current rounding mode encoding
80 sub $0x400, %ecx
81 cmp $0x400, %ecx
82 jg L_roundToZero // ecx = 0x800
83 je L_roundUp // ecx = 0x400
84 jb L_roundDown // ecx = 0x000
85
86L_roundToNearest: // fall into default rounding path
87 subss %xmm1, %xmm0 // fractional part of x
88 movd %xmm0, %ecx
89 or $0x3f000000, %edx // copysignf(0.5f,x)
90 xor %edx, %ecx // frac(x) ^ copysignf(0.5f,x)
91 jz L_exactHalfway // handle exact halfway cases separately
92
93 sub $0x00800000, %ecx // |frac(x)| > 0.5 ? negative : positive
94 or $0x00800000, %edx
95 sar $31, %ecx // |frac(x)| > 0.5 ? -1 : 0
96 and %ecx, %edx // |frac(x)| > 0.5 ? copysignf(1.0f,x) : 0
97 movd %edx, %xmm0
98 addss %xmm1, %xmm0 // |frac(x)| > 0.5 ? trunc(x) + copysign(1.0f,x) : trunc(x)
99 orps %xmm2, %xmm0 // apply signbit to get the right zero
100#if defined( __i386__ )
101 movss %xmm0, floatval
102 flds floatval
103 add $4, %esp
104#endif
105 ret
106
107L_exactHalfway:
108 shl $1, %eax
109 addss %xmm0, %xmm0 // copysignf(1.0f,x)
110 or $0x80000000, %eax // mask covering signbit and bits larger than 2^1
111 movd %eax, %xmm3
112 pandn %xmm1, %xmm3 // trunc(x) odd ? non-zero : zero
113 pxor %xmm4, %xmm4
114 pcmpeqd %xmm4, %xmm3 // trunc(x) odd ? 0 : -1
115 andnps %xmm0, %xmm3 // trunc(x) odd ? copysignf(1.0f,x) : 0.0f
116 addss %xmm3, %xmm1
117 orps %xmm2, %xmm1 // apply signbit to result
118L_roundToZero:
119#if defined( __i386__ )
120 movss %xmm1, floatval
121 flds floatval
122 add $4, %esp
123#else
124 movaps %xmm1, %xmm0
125#endif
126 ret
127
128L_roundUp: // we know that x is not integral
129 psrad $31, %xmm0 // x < 0 ? -1 : 0
130 pcmpeqb %xmm3, %xmm3 // -1
131 psubd %xmm3, %xmm0 // x < 0 ? 0 : 1
132 cvtdq2ps %xmm0, %xmm0 // x < 0 ? 0.0 : 1.0
133 addss %xmm1, %xmm0 // x < 0 ? trunc(x) : trunc(x) + 1
134 orps %xmm2, %xmm0 // apply signbit to get the right zero
135#if defined( __i386__ )
136 movss %xmm0, floatval
137 flds floatval
138 add $4, %esp
139#endif
140 ret
141
142L_roundDown: // we know that x is not integral
143 psrad $31, %xmm0 // x < 0 ? -1 : 0
144 cvtdq2ps %xmm0, %xmm0 // x < 0 ? -1 : 0
145 addss %xmm1, %xmm0 // x < 0 ? trunc(x) - 1 : trunc(x)
146#if defined( __i386__ )
147 movss %xmm0, floatval
148 flds floatval
149 add $4, %esp
150#endif
151 ret
152
153L_edgeCases: // if |x| < 1.0, jump back to mainline
154 jl L_reentryForSmallX // otherwise, we just return x
155#if defined( __i386__ )
156 movss %xmm0, floatval
157 flds floatval
158 add $4, %esp
159#endif
160 ret