diff --git a/src/include/OpenImageIO/fmath.h b/src/include/OpenImageIO/fmath.h index 937fe1cbca..af450d2115 100644 --- a/src/include/OpenImageIO/fmath.h +++ b/src/include/OpenImageIO/fmath.h @@ -761,16 +761,44 @@ inline OIIO_HOSTDEVICE float sign (float x) /// equivalently to C++20 std::bit_cast, but it works prior to C++20 and /// it has the right decorators to work with Cuda. /// @version 2.4.1 -template -OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bitcast (const IN_TYPE& in) noexcept { - // NOTE: this is the only standards compliant way of doing this type of casting, - // luckily the compilers we care about know how to optimize away this idiom. - static_assert(sizeof(IN_TYPE) == sizeof(OUT_TYPE), +template +OIIO_FORCEINLINE OIIO_HOSTDEVICE To bitcast(const From& from) noexcept { + static_assert(sizeof(From) == sizeof(To), "bit_cast must be between objects of the same size"); - OUT_TYPE out; - memcpy ((void *)&out, &in, sizeof(IN_TYPE)); - return out; + // NOTE: this is the only standards compliant way of doing this type of + // casting. This seems to generate optimal code for gcc, clang, MSVS, and + // icx, for both scalar code and vectorized loops, but icc fails to + // vectorize without the intrinsics overrides below. + // + // If we ever find the memcpy isn't doing the job, we should try + // gcc/clang's __builtin_bit_cast and see if that's any better. Some day + // this may all be replaced with C++20 std::bit_cast, but we should not do + // so without checking that it works ok for vectorized loops. + To result; + memcpy ((void *)&result, &from, sizeof(From)); + return result; +} + +#if defined(__INTEL_COMPILER) +// For Intel icc, using the memcpy implementation above will cause a loop with +// a bitcast to fail to vectorize, but using the intrinsics below will allow +// it to vectorize. For icx, as well as gcc and clang, the same optimal code +// is generated (even in a vectorized loop) for memcpy. We can probably remove +// these intrinsics once we drop support for icc. +template<> OIIO_FORCEINLINE uint32_t bitcast(const float& val) noexcept { + return static_cast(_castf32_u32(val)); } +template<> OIIO_FORCEINLINE int32_t bitcast(const float& val) noexcept { + return static_cast(_castf32_u32(val)); +} +template<> OIIO_FORCEINLINE float bitcast(const uint32_t& val) noexcept { + return _castu32_f32(val); +} +template<> OIIO_FORCEINLINE float bitcast(const int32_t& val) noexcept { + return _castu32_f32(val); +} +#endif + #if OIIO_VERSION_LESS(3, 0, 0) /// Note: The C++20 std::bit_cast has the reverse order of the template @@ -787,42 +815,6 @@ OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE& in) { } #endif -#if defined(__x86_64__) && !defined(__CUDA_ARCH__) && \ - (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) \ - || OIIO_CLANG_VERSION >= 100000 || OIIO_APPLE_CLANG_VERSION >= 130000) -// On x86/x86_64 for certain compilers we can use Intel CPU intrinsics for -// some common bitcast cases that might be even more understandable to the -// compiler and generate better code without its getting confused about the -// memcpy in the general case. We're a bit conservative with the compiler -// version checks here, it may be that some earlier versions support these -// intrinsics. - -template<> OIIO_FORCEINLINE uint32_t bitcast(const float& val) noexcept { - return static_cast(_castf32_u32(val)); -} -template<> OIIO_FORCEINLINE int32_t bitcast(const float& val) noexcept { - return static_cast(_castf32_u32(val)); -} -template<> OIIO_FORCEINLINE float bitcast(const uint32_t& val) noexcept { - return _castu32_f32(val); -} -template<> OIIO_FORCEINLINE float bitcast(const int32_t& val) noexcept { - return _castu32_f32(val); -} -template<> OIIO_FORCEINLINE uint64_t bitcast(const double& val) noexcept { - return static_cast(_castf64_u64(val)); -} -template<> OIIO_FORCEINLINE int64_t bitcast(const double& val) noexcept { - return static_cast(_castf64_u64(val)); -} -template<> OIIO_FORCEINLINE double bitcast(const uint64_t& val) noexcept { - return _castu64_f64(val); -} -template<> OIIO_FORCEINLINE double bitcast(const int64_t& val) noexcept { - return _castu64_f64(val); -} -#endif - OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int (float x) { return bitcast(x);