18#if defined (_WIN32) || defined (__i386__)
19#define BT_USE_SSE_IN_API
27#if defined BT_USE_SIMD_VECTOR3
43#if defined BT_USE_SSE || defined _WIN32
45#define LOG2_ARRAY_SIZE 6
46#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
54 static const unsigned char indexTable[16] = {(
unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
81 float4 v3 = vertices[3]; vertices += 4;
102 v3 = vertices[3]; vertices += 4;
123 v3 = vertices[3]; vertices += 4;
144 v3 = vertices[3]; vertices += 4;
193 for( ; index + 4 <= count / 4; index+=4 )
198 float4 v3 = vertices[3]; vertices += 4;
219 v3 = vertices[3]; vertices += 4;
240 v3 = vertices[3]; vertices += 4;
261 v3 = vertices[3]; vertices += 4;
294 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
295 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
296 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
297 movaps %[t0], %[max] // vertices[0] \n\
298 movlhps %[t1], %[max] // x0y0x1y1 \n\
299 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
300 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
301 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
302 movhlps %[t0], %[t1] // z0w0z1w1 \n\
303 movaps %[t3], %[t0] // vertices[2] \n\
304 movlhps %[t4], %[t0] // x2y2x3y3 \n\
305 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
306 movhlps %[t3], %[t4] // z2w2z3w3 \n\
307 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
308 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
309 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
310 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
311 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
312 addps %[t3], %[max] // x + y \n\
313 addps %[t1], %[max] // x + y + z \n\
314 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
315 maxps %[t2], %[max] // record max, restore max \n\
316 add $16, %[byteIndex] // advance loop counter\n\
326 for(
unsigned int i=0; i<
localCount/4; i++,index++)
357 float4 v0, v1, v2, x, y, z;
439 static const unsigned char indexTable[16] = {(
unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
466 float4 v3 = vertices[3]; vertices += 4;
487 v3 = vertices[3]; vertices += 4;
508 v3 = vertices[3]; vertices += 4;
529 v3 = vertices[3]; vertices += 4;
578 for( ; index + 4 <= count / 4; index+=4 )
583 float4 v3 = vertices[3]; vertices += 4;
604 v3 = vertices[3]; vertices += 4;
625 v3 = vertices[3]; vertices += 4;
646 v3 = vertices[3]; vertices += 4;
681 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
682 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
683 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
684 movaps %[t0], %[min] // vertices[0] \n\
685 movlhps %[t1], %[min] // x0y0x1y1 \n\
686 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
687 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
688 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
689 movhlps %[t0], %[t1] // z0w0z1w1 \n\
690 movaps %[t3], %[t0] // vertices[2] \n\
691 movlhps %[t4], %[t0] // x2y2x3y3 \n\
692 movhlps %[t3], %[t4] // z2w2z3w3 \n\
693 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
694 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
695 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
696 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
697 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
698 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
699 addps %[t3], %[min] // x + y \n\
700 addps %[t1], %[min] // x + y + z \n\
701 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
702 minps %[t2], %[min] // record min, restore min \n\
703 add $16, %[byteIndex] // advance loop counter\n\
713 for(
unsigned int i=0; i<
localCount/4; i++,index++)
745 float4 v0, v1, v2, x, y, z;
823#elif defined BT_USE_NEON
825#define ARM_NEON_GCC_COMPATIBILITY 1
827#include <sys/types.h>
828#include <sys/sysctl.h>
889# define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
892# define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); _r; })
910 for( ; i+8 <= count; i+= 8 )
971 for( ; i+4 <= count; i+= 4 )
1096 unsigned long i = 0;
1097 for( ; i + 8 <= count; i += 8 )
1152 for( ; i + 4 <= count; i += 4 )
1181 switch (count & 3) {
1283 unsigned long i = 0;
1295 for( ; i+8 <= count; i+= 8 )
1356 for( ; i+4 <= count; i+= 4 )
1479 unsigned long i = 0;
1480 for( ; i + 8 <= count; i += 8 )
1535 for( ; i + 4 <= count; i += 4 )
1564 switch (count & 3) {
1665 #error Unhandled __APPLE__ arch
const T & btMax(const T &a, const T &b)