// nearbyint.s -- SSE implementation of nearbyint( ) for i386 and x86_64 (Apple Libm)
// double nearbyint(double x)
//
// returns the argument rounded to an integral value using the prevailing
// rounding mode, and without raising the inexact flag (C99 7.12.9.3).
//
// -- Stephen Canon, January 2010
7
8#include <System/i386/cpu_capabilities.h>
9
.text
.align 4
.globl _nearbyint
_nearbyint:

// Fast path if SSE4.1 is available ------------------------------------------
//
// ROUNDSD immediate 0xc = bit 2 (RS: use the current MXCSR rounding mode) +
// bit 3 (suppress the precision/inexact exception), which is exactly the
// contract of nearbyint.

#if defined __i386__
    movsd   4(%esp),    %xmm0       // load the double argument from the stack
    mov     _COMM_PAGE_CPU_CAPABILITIES, %eax // CPU feature bits published in the commpage
    test    $(kHasSSE4_1), %eax
    jz      L_noSSE4_1              // no ROUNDSD -> take the SSE2 fallback below
    roundsd $0xc,   %xmm0,  %xmm0   // round per current mode, inexact suppressed
    movsd   %xmm0,  4(%esp)         // i386 ABI returns doubles on the x87 stack,
    fldl    4(%esp)                 // so bounce the result through memory
    ret
#elif defined __x86_64__
    movabs  _COMM_PAGE_CPU_CAPABILITIES, %eax // commpage lives at a fixed 64-bit address
    test    $(kHasSSE4_1), %eax
    jz      L_noSSE4_1              // no ROUNDSD -> take the SSE2 fallback below
    roundsd $0xc,   %xmm0,  %xmm0   // round per current mode, inexact suppressed
    ret
#else
#error "This implementations supports i386 and x86_64 only"
#endif

// Fallback path if SSE4.1 is not available ----------------------------------
//
// The rounding mode is read from the x87 FP control word (fetched with
// fnstcw below), rounding control bits 11:10:
//
// 00 -- Round to nearest even
// 01 -- Round to -Inf
// 10 -- Round to +Inf
// 11 -- Round to zero
//
// This path is adapted from Ian Ollmann's original implementation

L_noSSE4_1:
#if defined __i386__
    mov     8(%esp),    %ecx        // high word of input
    sub     $4, %esp                // make space for the fp control word
    #define fpcw (%esp)
    #define floatval 8(%esp)        // address of argument to nearbyint
#else
    movd    %xmm0,  %rcx
    shr     $32,    %rcx            // high word of input
    #define fpcw -4(%rsp)           // space for fp control word in red zone
#endif
    fnstcw  fpcw                    // capture the current x87 rounding mode

    mov     %ecx,   %edx
    pcmpeqb %xmm1,  %xmm1           // all ones
    and     $0x7fffffff, %ecx       // |x|
    psllq   $63,    %xmm1           // -0.0 (sign-bit mask)
    xor     %ecx,   %edx            // signbit of x
    movapd  %xmm1,  %xmm2
    sub     $0x3ff00000, %ecx       // high word of |x| - exponent bias
    andpd   %xmm0,  %xmm2           // signbit of x
    cmp     $0x03400000, %ecx       // if |x| < 1.0 or |x| >= 0x1.0p52 or isnan(x)
    jae     L_edgeCases             // branch (unsigned: catches negative ecx too)

    shr     $20,    %ecx            // exponent of x, in [0, 51]
    sub     $52,    %ecx
    neg     %ecx                    // number of fractional bits in x
    pcmpeqb %xmm1,  %xmm1
    movd    %ecx,   %xmm7           // keep the shift count; reused in L_exactHalfway
    psllq   %xmm7,  %xmm1           // mask covering integral bits of x

L_reentryForSmallX:                 // from L_edgeCases: xmm1 = sign bit only, xmm7 = 0
    andpd   %xmm0,  %xmm1           // trunc(x)
    ucomisd %xmm1,  %xmm0           // if x == trunc(x)
    je      L_roundToZero           // return trunc(x)

    mov     fpcw,   %ecx
    and     $0xc00, %ecx            // current rounding mode encoding
    sub     $0x400, %ecx            // bias so the four modes separate below:
    cmp     $0x400, %ecx            // nearest -> 0xfffffc00 falls through (jb is unsigned)
    jg      L_roundToZero           // ecx = 0x800
    je      L_roundUp               // ecx = 0x400
    jb      L_roundDown             // ecx = 0x000

L_roundToNearest:                   // fall into default rounding path
    subsd   %xmm1,  %xmm0           // fractional part of x
    pcmpeqb %xmm3,  %xmm3
    psllq   $55,    %xmm3
    psrlq   $2,     %xmm3           // 0.5
    xorpd   %xmm2,  %xmm0           // |fract(x)|
    ucomisd %xmm3,  %xmm0           // if |fractional part| == 0.5
    je      L_exactHalfway          // branch to handle exact halfway case

    cmpltsd %xmm3,  %xmm0           // |fract(x)| < 0.5 ? -1 : 0
    addsd   %xmm3,  %xmm3           // 1.0
    orpd    %xmm2,  %xmm3           // copysign(1.0,x)
    andnpd  %xmm3,  %xmm0           // |fract(x)| < 0.5 ? 0.0 : copysign(1.0,x)
    addsd   %xmm1,  %xmm0           // correctly rounded result
    orpd    %xmm2,  %xmm0           // restore sign of zero
#if defined( __i386__ )
    movsd   %xmm0,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#endif
    ret

L_exactHalfway:                     // exact .5 fraction: round to even
    addsd   %xmm3,  %xmm3           // 1.0
    pcmpeqb %xmm4,  %xmm4           // all ones (-1 per lane)
    psubq   %xmm4,  %xmm7           // shift count + 1 (subtracting -1)
    psllq   %xmm7,  %xmm4           // mask covering signbit and bits larger than 2^1
    orpd    %xmm2,  %xmm3           // copysign(1.0, x)
    pandn   %xmm1,  %xmm4           // trunc(x) odd ? nonzero : zero
    xorpd   %xmm5,  %xmm5           // 0, for the dword-wise zero test below
    pcmpeqd %xmm5,  %xmm4           // trunc(x) odd ? at least one dword is 0 : -1
    pshufd  $0xe1,  %xmm4,  %xmm5   // swap the two dwords of the low qword
    pand    %xmm5,  %xmm4           // trunc(x) odd ? 0 : -1 (both dwords must be zero)
    andnpd  %xmm3,  %xmm4           // trunc(x) odd ? copysign(1.0,x), 0
    addsd   %xmm4,  %xmm1           // step away from zero only if trunc(x) is odd
    orpd    %xmm2,  %xmm1           // reapply sign of zero

L_roundToZero:                      // also the common "x already integral" exit
#if defined( __i386__ )
    movsd   %xmm1,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#else
    movapd  %xmm1,  %xmm0
#endif
    ret

L_roundUp:                          // we already know that x is not integral
    xorpd   %xmm3,  %xmm3
    cmpltsd %xmm0,  %xmm3           // x > 0 ? -1 : 0
    cvtdq2pd %xmm3, %xmm3           // x > 0 ? -1.0 : 0.0
    subsd   %xmm3,  %xmm1           // x > 0 ? trunc(x) + 1.0 : trunc(x)
#if defined( __i386__ )
    movsd   %xmm1,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#else
    movapd  %xmm1,  %xmm0
#endif
    ret

L_roundDown:                        // we already know that x is not integral
    psrlq   $63,    %xmm0           // x < 0 ? 1 : 0
    cvtdq2pd %xmm0, %xmm3           // x < 0 ? 1.0 : 0.0
    subsd   %xmm3,  %xmm1           // x < 0 ? trunc(x) - 1.0 : trunc(x)
    pcmpeqb %xmm0,  %xmm0
    psrlq   $1,     %xmm0           // 0x7fffffffffffffff
    orpd    %xmm2,  %xmm0           // x < 0 ? 0xffffffffffffffff : 0x7fffffffffffffff
    andpd   %xmm1,  %xmm0           // clear the sign for x > 0 (round-down never yields -0 there)
#if defined( __i386__ )
    movsd   %xmm0,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#endif
    ret

L_edgeCases:                        // if |x| < 1.0, jump back to mainline
    xorpd   %xmm7,  %xmm7           // initialize data in xmm7 (shift count 0: xmm1
                                    // keeps only the sign bit, so trunc(x) = +-0.0)
    jl      L_reentryForSmallX      // otherwise (|x| >= 2^52 or NaN), x is already
                                    // integral: we just return x
#if defined( __i386__ )
    movsd   %xmm0,  floatval        // return on the x87 stack (i386 ABI)
    fldl    floatval
    add     $4, %esp                // pop the fpcw scratch slot
#endif
    ret
176
177/*
1787: //Round to even nearest, half way case
179 addsd %xmm6, %xmm6 // 1.0
180 pcmpeqb %xmm3, %xmm3 // -1ULL
181 psubq %xmm3, %xmm7 // add one to the truncation mask shift constant
182 psllq %xmm7, %xmm3 // prepare a new truncation mask with left edge past unit bit. (Works for 1.0 too, since least significant bit of exponent has odd parity)
183 orpd %xmm2, %xmm6 // copysign( 1.0, x )
184 pandn %xmm1, %xmm3 // if trunc(x) is odd, this bit will be non-zero (in the +-0.5, case we have +-0.5 here instead of just a bit set )
185 xorpd %xmm4, %xmm4 // 0.0
	pcmpeqd	%xmm4, %xmm3			// check if each 32-bit chunk is equal to 0. Unfortunately there is no 64-bit integer compare. We'd hit denorms here if we used a double-precision compare.
187 pshufd $0xE1, %xmm3, %xmm4 // swap both chunks
188 pand %xmm4, %xmm3 // trunc(x) is odd ? 0 : -1U (make sure the other chunk is also equal to 0)
189 andnpd %xmm6, %xmm3 // trunc(x) is odd ? copysign( 1.0, x ) : 0.0f
190 addsd %xmm3, %xmm1 // round
191
192#if defined( __i386__ )
193 movsd %xmm1, (STACKP)
194 fldl (STACKP)
195#else
196 movsd %xmm1, %xmm0
197#endif
198 ADDP $(16-FRAME_SIZE), STACKP
199 ret
200
201 */
202