SIMD.h

Go to the documentation of this file.
00001 /**<!-------------------------------------------------------------------->
00002    @file   SIMD.h
00003    @author Travis Fischer (fisch0920@gmail.com)
00004    @date   Fall 2008
00005    
00006    @brief
00007       Contains useful/common definitions for working with SSE intrinsics
00008    <!-------------------------------------------------------------------->**/
00009 
00010 #ifndef SIMD_H_
00011 #define SIMD_H_
00012 
00013 #include <common/common.h>
00014 
00015 #if MILTON_ENABLE_SSE
00016 
00017 #include <xmmintrin.h> // SSE1
00018 #include <emmintrin.h> // SSE2
00019 #include <pmmintrin.h> // SSE3
00020 
// shorthand names for the register types provided by the intrinsic headers
typedef __m64   m64_t;    // 64-bit MMX register type
typedef __m128  m128f_t;  // 128-bit SSE register type (packed floats)
typedef __m128i m128i_t;  // 128-bit SSE2 register type (packed integers)
typedef __m128d m128d_t;  // 128-bit SSE2 register type (packed doubles)

// alignment utilities: request 16-byte alignment via the ALIGN_PRE / 
// ALIGN_POST helpers (which are defined outside of this header)
#define SSE_ALIGN16_PRE  ALIGN_PRE(16)
#define SSE_ALIGN16_POST ALIGN_POST(16)
00029 
/**
 * @brief
 *    Expands, inside a class body, to class-specific operator new / delete 
 * overloads (scalar and array forms) which allocate through malloc_aligned 
 * and release through free_aligned
 * 
 * @note
 *    An allocation function without a non-throwing exception specification 
 * must never return a null pointer, so a failed allocation is reported by 
 * throwing std::bad_alloc
 * @note
 *    malloc_aligned takes an unsigned byte count; requests which do not fit 
 * in an unsigned are rejected instead of being silently truncated
 */
#define DECLARE_ALIGNED_MEMORY_OPERATORS                             \
   static inline void *operator new(size_t s) {                      \
      void *ptr = 0;                                                 \
      if ((size_t)(unsigned)s == s)                                  \
         ptr = malloc_aligned((unsigned)s);                          \
      if (ptr == 0)                                                  \
         throw std::bad_alloc();                                     \
      return ptr;                                                    \
   }                                                                 \
   static inline void *operator new[](size_t s) {                    \
      void *ptr = 0;                                                 \
      if ((size_t)(unsigned)s == s)                                  \
         ptr = malloc_aligned((unsigned)s);                          \
      if (ptr == 0)                                                  \
         throw std::bad_alloc();                                     \
      return ptr;                                                    \
   }                                                                 \
   static inline void operator delete(void* ptr) {                   \
      free_aligned(ptr);                                             \
   }                                                                 \
   static inline void operator delete[](void* ptr) {                 \
      free_aligned(ptr);                                             \
   }
00043 
00044 #include <memory>
00045 
// Redeclarations of the global allocation functions. Dynamic exception 
// specifications ("throw (...)") were deprecated in C++11 and removed from 
// the language in C++17, so the specification is only spelled out when 
// compiling as pre-C++11.
#if defined(__cplusplus) && __cplusplus >= 201103L
extern void *operator new  (size_t size);
extern void *operator new[](size_t size);
#else
extern void *operator new  (size_t size) throw (std::bad_alloc);
extern void *operator new[](size_t size) throw (std::bad_alloc);
#endif
00048 
#ifdef __cplusplus
extern "C" {  // C linkage: turn off name mangling
#endif
   
   /**
    * @brief
    *    allocates a block of @p n bytes whose address lies on a 16-byte 
    * boundary
    * 
    * @returns a pointer to the aligned block
    * @note release the block with free_aligned
    * 
    * @see also _mm_malloc
    */
   void *malloc_aligned(unsigned n);
   
   /**
    * @brief
    *    releases the block at @p ptr, which is assumed to have been obtained 
    * from malloc_aligned
    * 
    * @see also _mm_free
    */
   void free_aligned(void *ptr);
   
#ifdef __cplusplus
}  // end of extern "C"
#endif
00074 
00075 
00076 /**
00077  * @brief
00078  *    128-bit SSE (Streaming SIMD Extension) registers require 16-byte 
00079  * alignment which necessitates special care when allocating objects containing
00080  * SSE data types both on the stack and on the heap (via the new operator)
00081  * 
00082  * @note
00083  *    SSE has been <b>enabled</b> in this build of Milton
00084  */
00085 struct SSE_ALIGN16_PRE SSEAligned {
00086    DECLARE_ALIGNED_MEMORY_OPERATORS
00087 } SSE_ALIGN16_POST;
00088 
00089 struct SSE_ALIGN16_PRE SimpleSSEVector : public SSEAligned {
00090    union {
00091       real_t  data[4];
00092       m128f_t vec;
00093       
00094       struct { real_t x, y, z, w; };
00095    };
00096    
00097    inline SimpleSSEVector(const m128f_t &v)
00098       : vec(v)
00099    { }
00100 } SSE_ALIGN16_POST;
00101 
00102 
// Shifts @p x right by (4 * imm) bytes, reads the low 32-bit integer and 
// converts that integer value to real_t.
// NOTE(review): this macro has the same name as the SSE4.1 intrinsic from 
// <smmintrin.h>; presumably the two must never be visible together -- confirm.
#define _mm_extract_epi32(x, imm) \
   ((real_t) _mm_cvtsi128_si32(_mm_srli_si128((x), (imm) * 4)))

// Reads element @p index of the float vector @p vec by way of a temporary 
// SimpleSSEVector.
#define _mm_extract_f32i(vec, index) \
   (SimpleSSEVector((vec)).data[(index)])

// Reads element 0 of the float vector @p vec.
#define _mm_extract_f32(vec) \
   _mm_extract_f32i((vec), 0)
00111 
00112 
00113 #ifdef __cplusplus
00114    extern "C" {  // turn off name mangling
00115 #endif
00116       
00117       /**
00118        * @returns fuzzy element-wise equality between @p a and @p b
00119        */
00120       static inline m128f_t SSE_EQ(const m128f_t &a, const m128f_t &b) {
00121          // could use the _mm_cmpeq_ps func here, but it has the same problems as 
00122          // comparing a float; so, see if a is roughly close to b... 
00123          const m128f_t &a_plus_a_bit = _mm_add_ps(a, _mm_set_ps1(EPSILON));
00124          const m128f_t &a_minus_a_bit = _mm_sub_ps(a, _mm_set_ps1(EPSILON));
00125          
00126          const m128f_t &b_lt_a = _mm_cmplt_ps(b, a_plus_a_bit);
00127          const m128f_t &b_gt_a = _mm_cmpgt_ps(b, a_minus_a_bit);
00128          
00129          return _mm_and_ps(b_lt_a, b_gt_a);
00130       }
00131       
00132       /**
00133        * @returns true iff @a and @b are approximately equal
00134        */
00135       static inline bool SSE_EQb(const m128f_t &a, const m128f_t &b) {
00136          const SimpleSSEVector v(_mm_cmpneq_ps(SSE_EQ(a, b), _mm_setzero_ps()));
00137          
00138          return (v.data[0] != 0 && v.data[1] != 0 && v.data[2] != 0 && v.data[3] != 0);
00139       }
00140       
00141       /**
00142        * @returns fuzzy element-wise inequality between @p a and @p b
00143        */
00144       static inline m128f_t SSE_NEQ(const m128f_t &a, const m128f_t &b) {
00145          // could use the _mm_cmpeq_ps func here, but it has the same problems as 
00146          // comparing a float; so, see if a is roughly close to b... 
00147          const m128f_t &a_plus_a_bit = _mm_add_ps(a, _mm_set_ps1(EPSILON));
00148          const m128f_t &a_minus_a_bit = _mm_sub_ps(a, _mm_set_ps1(EPSILON));
00149          
00150          const m128f_t &b_gt_a = _mm_cmpgt_ps(b, a_plus_a_bit);
00151          const m128f_t &b_lt_a = _mm_cmplt_ps(b, a_minus_a_bit);
00152          
00153          return _mm_or_ps(b_lt_a, b_gt_a);
00154       }
00155       
00156       /**
00157        * @returns true iff @a and @b are not approximately equal
00158        */
00159       static inline bool SSE_NEQb(const m128f_t &a, const m128f_t &b) {
00160          const SimpleSSEVector v(_mm_cmpneq_ps(SSE_EQ(a, b), _mm_setzero_ps()));
00161          
00162          return (v.data[0] == 0 || v.data[1] == 0 || v.data[2] == 0 || v.data[3] == 0);
00163       }
00164       
00165 #ifdef __cplusplus
00166    }  // end of extern "C"
00167 #endif
00168 
00169 #else  // MILTON_ENABLE_SSE
00170 
/**
 * @brief
 *    Placeholder counterpart of the 16-byte aligned SSEAligned base type, 
 * which exists because 128-bit SSE (Streaming SIMD Extension) registers 
 * require 16-byte alignment on both the stack and the heap
 * 
 * @note
 *    SSE has been <b>disabled</b> in this build of Milton, so this version is 
 * an empty struct with no alignment requirement and no custom memory 
 * operators
 */
struct SSEAligned {
};
00183 
00184 #endif // MILTON_ENABLE_SSE
00185 
00186 #endif // SIMD_H_
00187 

Generated on 28 Feb 2009 for Milton by doxygen 1.5.6