diff --git a/BPTCEncoder/src/RGBAEndpoints.cpp b/BPTCEncoder/src/RGBAEndpoints.cpp index fd64498..e7b11c0 100755 --- a/BPTCEncoder/src/RGBAEndpoints.cpp +++ b/BPTCEncoder/src/RGBAEndpoints.cpp @@ -67,7 +67,7 @@ static const float kFloatConversion[256] = { /////////////////////////////////////////////////////////////////////////////// static inline uint32 CountBitsInMask(uint8 n) { -#if defined(_WIN64) || defined(NO_INLINE_ASSEMBLY) +#if defined(_WIN64) || defined(__x86_64__) || defined(NO_INLINE_ASSEMBLY) if(!n) return 0; // no bits set if(!(n & (n-1))) return 1; // power of two @@ -85,14 +85,17 @@ static inline uint32 CountBitsInMask(uint8 n) { sub eax, ecx } #else - __asm__("mov %%eax, 8;" - "movzbl %0, %%ecx;" + uint32 ans; + __asm__("movl $8, %%eax;" + "movzbl %b1, %%ecx;" "bsf %%ecx, %%ecx;" - "sub %%eax, %%ecx;" - : // No output registers + "subl %%ecx, %%eax;" + "movl %%eax, %0;" + : "=Q"(ans) : "r"(n) : "%eax", "%ecx" ); + return ans; #endif #endif } @@ -105,7 +108,7 @@ static inline void clamp(ty &x, const ty &min, const ty &max) { // absolute distance. It turns out the compiler does a much // better job of optimizing this than we can, since we can't // translate the values to/from registers -static uint8 sad(uint8 a, uint8 b) { +static uint sad(uint8 a, uint8 b) { #if 0 __asm {