···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
43 *
···3635 *
3736 * The following #defines are tightly coupled to the u-architecture:
3837 */
3939-4040-#ifdef DARLING
4141-# define _longcopy longcopy@PLT
4242-#endif
43384439#define kShort 80 // too short to bother with SSE (must be >=80)
4540#define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB)
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
43 *
···4241 * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
4342 * on P4s and probably other processors.
4443 */
4545-4646-#ifdef DARLING
4747-# define _memset memset
4848-# define _memset_pattern4 memset_pattern4
4949-# define _memset_pattern8 memset_pattern8
5050-# define _memset_pattern16 memset_pattern16
5151-# define _bzero bzero@PLT
5252-.type memset, @function
5353-#endif
54445545#define kShort 255 // for nonzero memset(), too short for commpage
5646
+1-7
src/libc/x86_64/string/strcmp.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
43 *
···3534// reading bytes past the difference. To avoid this, we never do a load
3635// that crosses a page boundary.
37363838-#ifdef DARLING
3939-# define _strcmp strcmp
4040-.type strcmp, @function
4141-#endif
4242-4337 .text
4438 .globl _strcmp
45394646- .align (2<<4)
4040+ .align 4
4741_strcmp: // int strcmp(const char *s1,const char *s2);
48424943// In order to avoid spurious page faults, we loop over:
+1-7
src/libc/x86_64/string/strcpy.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
43 *
···3736//
3837// We align the destination, because unaligned vector stores are slow.
39384040-#ifdef DARLING
4141-# define _strcpy strcpy
4242-.type strcpy, @function
4343-#endif
4444-4539 .text
4640 .globl _strcpy
47414848- .align (2<<4)
4242+ .align 4
4943_strcpy: // char *strcpy(const char *dst, const char *src);
5044 movq %rdi,%rcx // preserve dest ptr so we can return it
5145 movl %edi,%edx // copy low 4 bytes of dest ptr
+1-7
src/libc/x86_64/string/strlcat.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2007 Apple Inc. All rights reserved.
43 *
···4746// On Core2 class machines, this algorithm seems to be faster than the
4847// naive byte-by-byte version for operands longer than about 11 bytes.
49485050-#ifdef DARLING
5151-# define _strlcat strlcat
5252-.type strlcat, @function
5353-#endif
5454-5549 .text
5650 .globl _strlcat
5751···6458// %rsi = source ptr
6559// %rdx = size
66606767- .align (2<<4)
6161+ .align 4
6862_strlcat: // size_t *strlcat(char *dst, const char *src, size_t size);
6963 movl %edi,%ecx // copy buffer ptr
7064 movq %rdi,%r10 // save copies of buffer ptr and length
+1-7
src/libc/x86_64/string/strlcpy.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2007 Apple Inc. All rights reserved.
43 *
···4443// with one exception: 0x01 bytes preceding the first zero are also
4544// mapped to 0x80.
46454747-#ifdef DARLING
4848-# define _strlcpy strlcpy
4949-.type strlcpy,@function
5050-#endif
5151-5246 .text
5347 .globl _strlcpy
5448···5751// %rsi = source ptr
5852// %rdx = length
59536060- .align (2<<4)
5454+ .align 4
6155_strlcpy: // size_t *strlcpy(char *dst, const char *src, size_t size);
6256 movl %esi,%ecx // copy source ptr
6357 movq %rdi,%r10 // copy dest ptr
+1-7
src/libc/x86_64/string/strlen.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2005-2007 Apple Inc. All rights reserved.
43 *
···3231 * We favor the fall-through (ie, short operand) path.
3332 */
34333535-#ifdef DARLING
3636-# define _strlen strlen
3737-.type strlen, @function
3838-#endif
3939-4034 .text
4135 .globl _strlen
4242- .align (2<<4), 0x90
3636+ .align 4, 0x90
4337_strlen: // size_t strlen(char *b);
4438 pxor %xmm0,%xmm0 // zero %xmm0
4539 movl %edi,%ecx // copy low half of ptr
+1-7
src/libc/x86_64/string/strncmp.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
43 *
···3534// reading bytes past the difference. To avoid this, we never do a load
3635// that crosses a page boundary.
37363838-#ifdef DARLING
3939-# define _strncmp strncmp
4040-.type strncmp, @function
4141-#endif
4242-4337#define kShort 20 // too short for vectors (must be >16)
44384539 .text
4640 .globl _strncmp
47414848- .align (2<<4)
4242+ .align 4
4943_strncmp: // int strncmp(const char *s1, const char *s2, size_t len);
5044 cmpq $(kShort),%rdx // worth accelerating?
5145 ja LNotShort // yes
-7
src/libc/x86_64/string/strncpy.S
···11-// Modified by Lubos Dolezel for Darling
21/*
32 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
43 *
···4241// Recall that strncpy() zero fills the remainder of the dest buffer,
4342// and does not terminate the string if its length is greater than or
4443// equal to n.
4545-4646-#ifdef DARLING
4747-# define _strncpy strncpy
4848-# define _bzero bzero@PLT
4949-.type strncpy, @function
5050-#endif
51445245#define kShort 31 // too short to bother with vector loop
5346
···2828S1: .double -3.333333463718495862077843807
292930303131- ////.const
3232- .align (1<<4)
3131+ .const
3232+ .align 4
33333434/* Define some coefficients for center polynomial (used for x in [-.62,
3535 +.62]). These are stored in pairs at aligned addresses for use in SIMD
···158158 showing that a valid result will be obtained regardless of which
159159 value rsqrtss provides.
160160*/
161161-#define _acosf acosf
162162- .align (1<<5)
161161+ .align 5
163162#if !defined DevelopmentInstrumentation
164163 // This is the regular name used in the deployed implementation.
165165- .globl acosf
166166- acosf:
164164+ .globl _acosf
165165+ _acosf:
167166#else
168167 // This is the name used for a special test version of the routine.
169168 .globl _acosfInstrumented
···2727S1: .double -3.333333463718495862077843807
282829293030- ////.const
3131- .align (1<<4)
3030+ .const
3131+ .align 4
32323333/* Define some coefficients for center polynomial (used for x in [-.57,
3434 +.57]). These are stored in pairs at aligned addresses for use in SIMD
···162162 showing that a valid result will be obtained regardless of which
163163 value rsqrtss provides.
164164*/
165165- .align (1<<5)
165165+ .align 5
166166#if !defined DevelopmentInstrumentation
167167 // This is the regular name used in the deployed implementation.
168168- .globl asinf
169169- asinf:
168168+ .globl _asinf
169169+ _asinf:
170170#else
171171 // This is the name used for a special test version of the routine.
172172 .globl _asinfInstrumented
+6-6
src/libm/Source/Intel/asinhf.S
···5252#include <machine/asm.h>
5353#include "abi.h"
54545555-////.const
5555+.const
56565757// Coefficients for 7th order polynomial approximation on [0, 1/4]
5858// The polynomial is computed in packed factored form as follows:
···6161// lo double: (cx * (x + alo)) * (x(x + b1lo) + b0lo)
6262//
6363// The high and low parts are then unpacked and multiplied.
6464-.align (1<<4)
6464+.align 4
6565asinhf_low: .quad 0x4008183efcaf7119, 0x4007eba0a6c21cf1 // b0hi, b0lo
6666 .quad 0x3ffe9547f4507ace, 0xbffd2c173c2ad586 // ahi, alo
6767 .quad 0x40021aa6afb79159, 0xc0015e2ed556dde5 // b1hi, b1lo
···69697070// Coefficients for rational approximation on [1/4, 4]
7171// p(x) and q(x) are computed side-by-side in packed arithmetic, then unpacked and divided.
7272-.align (1<<4)
7272+.align 4
7373asinhf_mid: .quad 0x3e4328ccef61bd30, 0x3f80f6f9cf323b3c // p[0], q[0]
7474 .quad 0x3f80f6e561f06785, 0x3f85b29a3e277523 // p[1], q[1]
7575 .quad 0x3f85b32a2f11b40a, 0x3f8e26416e925090 // p[2], q[2]
···7979 .quad 0x3f26c17c7b263d18, 0x3f001e11059bddca // p[6], q[6]
808081818282-.align (1<<4)
8282+.align 4
8383// Polynomial coefficients for the correction to log(2x) for the "large" case.
8484// The polynomial is computed in factored form as follows:
8585//
···348348 .quad 0x3fe61e3efda46467, 0x3fe0080402010080 // log(1.99609), 1/1.99609
349349350350.literal8
351351-.align (1<<3)
351351+.align 3
352352one: .quad 0x3ff0000000000000
353353onehalf: .quad 0x3fe0000000000000
354354onethird: .quad 0x3fd5555555555555
···360360#else
361361 #define RELATIVE_ADDR( _a ) (_a)-asinhf_body( %ecx )
362362363363-.align (1<<4)
363363+.align 4
364364asinhf_pic:
365365 movl (%esp), %ecx // Copy address of this instruction to %ecx
366366 ret
+6-6
src/libm/Source/Intel/atan2f.S
···2929C2: .double 0.0029352921857004596570518
303031313232- //.const
3333- .align (1<<4)
3232+ .const
3333+ .align 4
34343535/* Define some coefficients for center polynomial (used for x in [-1, +1]).
3636 These are stored in pairs at aligned addresses for use in SIMD
···4242C10: .double 5.4728447324456990092824269, 6.7197076223592378022736307
43434444// This needs to be 16-byte aligned because it is used in an orpd instruction.
4545- .align (1<<4)
4545+ .align 4
4646pPi: .double +3.141592653589793238462643 // pi.
47474848···209209 (x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies
210210 atan2f(y0, x0) <= atan2f(y1, x1).)
211211*/
212212- .align (1<<5)
213213- .globl atan2f
214214-atan2f:
212212+ .align 5
213213+ .globl _atan2f
214214+_atan2f:
215215216216 cvtss2sd Argy, y // Convert to double precision.
217217 cvtss2sd Argx, x
+5-5
src/libm/Source/Intel/atanf.S
···2121C2: .double 0.0029352921857004596570518
222223232424- //.const
2525- .align (1<<4)
2424+ .const
2525+ .align 4
26262727/* Define some coefficients for center polynomial (used for x in [-1, +1]).
2828 These are stored in pairs at aligned addresses for use in SIMD
···155155 Exhaustive testing proved this routine returns faithfully rounded
156156 results.
157157*/
158158- .align (1<<5)
158158+ .align 5
159159#if !defined DevelopmentInstrumentation
160160 // This is the regular name used in the deployed implementation.
161161- .globl atanf
162162- atanf:
161161+ .globl _atanf
162162+ _atanf:
163163#else
164164 // This is the name used for a special test version of the routine.
165165 .globl _atanfInstrumented
+5-5
src/libm/Source/Intel/atanhf.S
···1212#include <machine/asm.h>
1313#include "abi.h"
14141515-//.const
1616-.align (1<<4)
1515+.const
1616+.align 4
1717// Polynomial coefficients used for the 7/8 < |x| < 1 case. These are addressed by offset from big_table,
1818// so take care if you move things around.
1919···554554 // a = 7/8
555555556556.literal8
557557-.align (1<<3)
557557+.align 3
558558one_256th: .quad 0x3f70000000000000 // 1/256
559559one: .quad 0x3ff0000000000000
560560one_plus_eps: .quad 0x3ff0000000000001 // 1 + ulp
···567567neglog2_2: .quad 0xbfd62e42fefa39ef // -ln(2)/2
568568569569.literal4
570570-.align (1<<2)
570570+.align 2
571571f256: .long 0x43800000 // 256.0f
572572573573.text
···577577#elif defined( __i386__ )
578578 #define RELATIVE_ADDR( _a) (_a)-atanhf_body( CX_P )
579579 #define INDEX %edi
580580-.align (1<<4)
580580+.align 4
581581atanhf_pic:
582582 movl (%esp), %ecx // copy address of local_addr to %ecx
583583 ret
···1010#endif
11111212.text
1313-.globl hypot
1414-.globl cabs
1313+.globl _hypot
1414+.globl _cabs
15151616#if defined __i386__
1717···20202121// Entry point --------------------------------------------------------
22222323-.align (1<<4)
2424-cabs:
2525-hypot:
2323+.align 4
2424+_cabs:
2525+_hypot:
2626 mov 8(%esp), %eax
2727 mov 16(%esp), %edx
2828 mov ABSHI, %ecx
···5555// Special case handling ----------------------------------------------
56565757L_xHiIsZero:
5858- cmpl $0, 4(%esp) // is the low word of x zero?
5858+ cmp $0, 4(%esp) // is the low word of x zero?
5959 jnz L_returnFromXHiIsZero // if not, jump back to mainline
6060L_returnAbsY:
6161 and %ecx, 16(%esp)
···6363 ret
64646565L_yHiIsZero:
6666- cmpl $0, 12(%esp) // is the low word of y zero?
6666+ cmp $0, 12(%esp) // is the low word of y zero?
6767 jnz L_returnFromYHiIsZero // if not, jump back to mainline
6868L_returnAbsX:
6969 and %ecx, 8(%esp)
···7373L_xIsSpecial:
7474 cmp INFHI, %edx // check if y is infinity
7575 jnz L_returnAbsX
7676- cmpl $0, 12(%esp)
7676+ cmp $0, 12(%esp)
7777 jz L_returnAbsY
7878 jmp L_returnAbsX
7979···84848585// Entry point --------------------------------------------------------
86868787-.align (1<<4)
8888-cabs:
8989-hypot:
8787+.align 4
8888+_cabs:
8989+_hypot:
9090 movd %xmm0, %rax
9191 movd %xmm1, %rdx
9292 mov ABSMASK, %rcx
+9-9
src/libm/Source/Intel/hypotf.S
···1313#define INFINITY $0x7f800000
14141515.text
1616-.globl cabsf
1717-.globl hypotf
1616+.globl _cabsf
1717+.globl _hypotf
18181919// Entry points -------------------------------------------------------
20202121#if defined __i386__
2222-.align (1<<4)
2323-cabsf: // on i386, we can use the same code for
2424-hypotf: // hypotf and cabsf, because the arguments
2222+.align 4
2323+_cabsf: // on i386, we can use the same code for
2424+_hypotf: // hypotf and cabsf, because the arguments
2525 mov 4(%esp), %eax // come in at the same stack offsets
2626 mov 8(%esp), %edx //
2727 movss 4(%esp), %xmm0 // real at esp + 4
2828 movss 8(%esp), %xmm1 // imag at esp + 8
2929#else
3030-.align (1<<4) // however, on x86_64, the registers used
3131-cabsf: // are different. cabsf's arguments come
3030+.align 4 // however, on x86_64, the registers used
3131+_cabsf: // are different. cabsf's arguments come
3232 pshufd $0xfd, %xmm0, %xmm1 // in packed in xmm0.
3333-.align (1<<4)
3434-hypotf: //
3333+.align 4
3434+_hypotf: //
3535 movd %xmm0, %eax // hypotf, on the other hand, gets x in
3636 movd %xmm1, %edx // xmm0 and y in xmm1.
3737#endif
···1818#include <machine/asm.h>
1919#include "abi.h"
20202121-//.const
2222-.align (1<<5)
2121+.const
2222+.align 5
2323xone: .quad 0x3ff0000000000000, 0
2424frexp_exp_mask: .quad 0x7ff0000000000000, 0
2525frexp_mant_mask: .quad 0x800fffffffffffff, 0
···2828//log10_key_mask: .quad 0x07e0000000000000, 0
2929log1p_not_ulp_mask: .quad 0xfffffffffffffffe, 0
30303131-.align (1<<5)
3131+.align 5
3232lgel: .quad 0xB8AA3B295C17F0BC, 0x3fff
3333ln2l: .quad 0xB17217F7D1CF79AC, 0x3ffe //ln(2) rounded up to long double
3434···37373838c0: .quad 0xFFFFFFFFFFFFFFD7, 0xbffd //c0 = -.4999999999999999988974167423L
39394040-.align (1<<5)
4040+.align 5
4141a01: .double .827742667285236703751556405085096, -2.00038644890076831031534988283768 //a0,a1
4242b01: .double 1.51843353412997067893915870795354, 1.54454569915832086827096843200102 //b0,b1
43434444// The lookup table is in a funny format since it has 2 long double and a single.
4545 // {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 6-byte pad}
4646-.align (1<<5)
4646+.align 5
4747LOOKUP:
4848// This is the table for a, ap1, va, lg1pa: a = (float)k*scale, ap1 = a + 1, va = (long double)1./(1.+a), lg1pa = (long double)log2(1.+a)
4949// In C this would be
···375375.quad 0x3ff0000000000000, 0x0000000000000000 //{0x1p+0, 0x0p+0}, k=63
376376377377.literal8
378378-.align (1<<3)
378378+.align 3
379379one: .double 1.0
380380mone: .double -1.0
381381
+4-4
src/libm/Source/Intel/log10f.S
···99#include <machine/asm.h>
1010#include "abi.h"
11111212-//.const
1313-.align (1<<4)
1212+.const
1313+.align 4
14141515// 256 entry of Lookup table of values used for log10 calculation, generated as:
1616//
···300300301301302302.literal8
303303-.align (1<<3)
303303+.align 3
304304one: .double 1.0
305305onehalf: .double 0.5
306306onethird: .quad 0x3fd5555555555555 // 1/3
···321321 #define RELATIVE_ADDR2( _a, _i, _step) (_a)-rel_addr( CX_P, _i, _step )
322322323323//a short routine to get the local address
324324-.align (1<<4)
324324+.align 4
325325log10f_pic: movl (%esp), %ecx //copy address of local_addr to %ecx
326326 ret
327327#else
+4-4
src/libm/Source/Intel/log2f.S
···99#include <machine/asm.h>
1010#include "abi.h"
11111212-//.const
1313-.align (1<<4)
1212+.const
1313+.align 4
14141515// 256 entry of Lookup table of values used for log2 calculation, generated as:
1616//
···300300 .quad 0x3fefe8e4f15bd1a0, 0x3fe0080402010080 //log2(1.99609), 1/1.99609
301301302302.literal8
303303-.align (1<<3)
303303+.align 3
304304one: .double 1.0
305305onehalf: .double 0.5
306306onethird: .quad 0x3fd5555555555555 // 1/3
···321321 #define RELATIVE_ADDR2( _a, _i, _step) (_a)-rel_addr( CX_P, _i, _step )
322322323323//a short routine to get the local address
324324-.align (1<<4)
324324+.align 4
325325log2f_pic: movl (%esp), %ecx //copy address of local_addr to %ecx
326326 ret
327327#else
+5-5
src/libm/Source/Intel/logf.S
···1515#include <machine/asm.h>
1616#include "abi.h"
17171818-//.const
1919-.align (1<<4)
1818+.const
1919+.align 4
20202121// 511 entry of Lookup table of values used for log and log1p calculation, generated as:
2222//
···436436437437438438.literal8
439439-.align (1<<3)
439439+.align 3
440440one: .double 1.0
441441onehalf: .double 0.5
442442onethird: .quad 0x3fd5555555555555 // 1/3
···448448log2: .quad 0x3fe62e42fefa39efULL // ln(2)
449449450450.literal4
451451-.align (1<<2)
451451+.align 2
452452f256: .long 0x43800000 //256.0f
453453r256: .long 0x3b800000 //1.0f/256.0f
454454···465465 #define RELATIVE_ADDR2( _a, _i, _step) (_a)-rel_addr( CX_P, _i, _step )
466466467467//a short routine to get the local address
468468-.align (1<<4)
468468+.align 4
469469logf_pic: movl (%esp), %ecx //copy address of local_addr to %ecx
470470 ret
471471#else