Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for b4_SSE2 batched mode. #1825

Merged
merged 7 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,18 @@ jobs:
pybind11_ver: v2.7.0
simd: sse4.2
batched: b8_AVX2_noFMA
setenvs: export ENABLE_OPENVDB=0
- desc: gcc9/C++17 llvm13 py3.9 oiio-rel avx2
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2
nametag: linux-vfx2021
runner: ubuntu-latest
container: aswftesting/ci-osl:2021-clang11
vfxyear: 2021
cxx_std: 17
openimageio_ver: v2.4.13.0
python_ver: 3.7
pybind11_ver: v2.7.0
simd: sse2
batched: b4_SSE2
- desc: gcc9/C++17 llvm13 py3.9 exr3.1 oiio-rel avx2
nametag: linux-vfx2022
runner: ubuntu-latest
container: aswftesting/ci-osl:2022-clang13
Expand Down
6 changes: 3 additions & 3 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as
well, but we don't officially support or test other than these platforms.

Shader execution is supported on the native architectures of those x86_64 and
aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode
requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs
using Cuda+OptiX.
aarch64 platforms, in a special batched 4-, 8-, or 16-wide SIMD execution mode
requiring x86_64 with SSE2, AVX/AVX2, or AVX-512 instructions, as well as on
NVIDIA GPUs using CUDA+OptiX.

Dependencies
------------
Expand Down
2 changes: 1 addition & 1 deletion src/cmake/compiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ endif ()
#
# The USE_BATCHED option may be set to indicate that support for batched
# SIMD shader execution be compiled along with target-specific libraries
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
set (BATCHED_SUPPORT_DEFINES "")
set (BATCHED_TARGET_LIBS "")
Expand Down
9 changes: 8 additions & 1 deletion src/include/OSL/batched_texture.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
== VecReg<8>::alignment,
"Expect alignment of data member to set alignment of struct");
static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
== VecReg<4>::alignment,
"Expect alignment of data member to set alignment of struct");

template<int WidthT> struct BatchedTextureOptions {
VaryingTextureOptions<WidthT> varying;
Expand Down Expand Up @@ -90,11 +93,15 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
== VecReg<8>::alignment,
"Expect alignment of data member to set alignment of struct");
static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
== VecReg<4>::alignment,
"Expect alignment of data member to set alignment of struct");

#ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
// Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
// and safe to reinterpret_cast<TextureOptBatch*>
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
// The check accepts OIIO texture batch widths of 16, 8, or 4, so the
// diagnostic must name all three rather than only the original 16.
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8)
                  || (OIIO::Tex::BatchWidth == 4),
              "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH to be 4, 8, or 16");

namespace validate_offsets {
Expand Down
3 changes: 3 additions & 0 deletions src/include/OSL/llvm_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util {
llvm::Constant* constant(uint32_t i);

/// Return an llvm::Constant holding the given integer constant.
llvm::Constant* constant4(int8_t i);
llvm::Constant* constant4(uint8_t i);
llvm::Constant* constant8(int8_t i);
llvm::Constant* constant8(uint8_t i);
llvm::Constant* constant16(int16_t i);
Expand Down Expand Up @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util {

llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index);
llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index);
llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index);
std::array<llvm::Value*, 2> op_split_16x(llvm::Value* vector_val);
std::array<llvm::Value*, 2> op_split_8x(llvm::Value* vector_val);
std::array<llvm::Value*, 4> op_quarter_16x(llvm::Value* vector_val);
Expand Down
1 change: 1 addition & 0 deletions src/include/OSL/rendererservices.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
/// Unless overridden, a nullptr is returned.
virtual BatchedRendererServices<16>* batched(WidthOf<16>);
virtual BatchedRendererServices<8>* batched(WidthOf<8>);
virtual BatchedRendererServices<4>* batched(WidthOf<4>);

protected:
TextureSystem* m_texturesys; // A place to hold a TextureSystem
Expand Down
4 changes: 4 additions & 0 deletions src/liboslexec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
list (APPEND TARGET_CXX_OPTS "-march=x86-64")
else ()
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
endif ()
Expand Down Expand Up @@ -454,6 +456,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
list (APPEND TARGET_CXX_OPTS "-march=haswell")
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
list (APPEND TARGET_CXX_OPTS "-march=x86-64")
else ()
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
endif ()
Expand Down
8 changes: 7 additions & 1 deletion src/liboslexec/batched_analysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1813,10 +1813,16 @@ struct Analyzer {
// specific BatchedRendererServices.
// Right here we don't know which width will be used,
// so we will just require all widths provide the same answer
auto rs4 = m_ba.renderer()->batched(WidthOf<4>());
auto rs8 = m_ba.renderer()->batched(WidthOf<8>());
auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
if (rs8 || rs16) {
if (rs4 || rs8 || rs16) {
get_attr_is_uniform = true;
if (rs4) {
get_attr_is_uniform
&= rs4->is_attribute_uniform(obj_name,
attr_name);
}
if (rs8) {
get_attr_is_uniform
&= rs8->is_attribute_uniform(obj_name,
Expand Down
1 change: 1 addition & 0 deletions src/liboslexec/batched_backendllvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
switch (vector_width()) {
case 16: m_true_mask_value = Mask<16>(true).value(); break;
case 8: m_true_mask_value = Mask<8>(true).value(); break;
case 4: m_true_mask_value = Mask<4>(true).value(); break;
default: OSL_ASSERT(0 && "unsupported vector width");
}
ll.dumpasm(shadingsys.m_llvm_dumpasm);
Expand Down
44 changes: 44 additions & 0 deletions src/liboslexec/batched_llvm_instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,33 @@ const char*
= "b8_AVX_";
#endif

// Static data for the 4-wide SSE2 batched target library. Compiled only when
// __OSL_SUPPORTS_b4_SSE2 is defined.
#ifdef __OSL_SUPPORTS_b4_SSE2
// Table of NameAndSignature entries for the <4, TargetISA::x64> helper. The
// entries are generated by the x-macro includes below rather than listed by
// hand.
template<>
const NameAndSignature
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
= {
// DECL forwards to DECL_INDIRECT so its arguments are macro-expanded before
// #name stringizes them; each use yields one "NameAndSignature { ... }," entry.
# define DECL_INDIRECT(name, signature) \
NameAndSignature { #name, signature },
# define DECL(name, signature) DECL_INDIRECT(name, signature)
// Width and ISA are defined only for the duration of the includes (undefined
// again below); presumably the included headers read them to build the
// width/ISA-specific function names -- confirm in wide/define_opname_macros.h.
# define __OSL_WIDTH 4
# define __OSL_TARGET_ISA SSE2
// Don't allow order of xmacro includes be rearranged
// clang-format off
# include "wide/define_opname_macros.h"
# include "builtindecl_wide_xmacro.h"
# include "wide/undef_opname_macros.h"
// clang-format on
// Undefine the helper macros so they do not leak past this table.
# undef __OSL_TARGET_ISA
# undef __OSL_WIDTH
# undef DECL
# undef DECL_INDIRECT
};
// Selector string for this width/ISA combination.
template<>
const char*
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
= "b4_SSE2_";
#endif



std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
Expand Down Expand Up @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
default: break;
}
break;
case 4:
switch (target_isa) {
#ifdef __OSL_SUPPORTS_b4_SSE2
case TargetISA::x64:
return RetType(
new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
#endif
default: break;
}
break;

default: OSL_ASSERT(0 && "unsupported vector width");
}
std::cerr << "Build is not configured to support TargetISA of "
Expand Down Expand Up @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
{
std::vector<unsigned int> offset_by_index;
switch (m_width) {
case 4:
build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
break;
case 8:
build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
break;
Expand Down Expand Up @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
{
std::vector<unsigned int> offset_by_index;
switch (m_width) {
case 4:
build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
break;
case 8:
build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
break;
Expand Down
1 change: 1 addition & 0 deletions src/liboslexec/batched_rendservices.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,5 +347,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
// Explicitly instantiate BatchedRendererServices template
template class OSLEXECPUBLIC BatchedRendererServices<16>;
template class OSLEXECPUBLIC BatchedRendererServices<8>;
template class OSLEXECPUBLIC BatchedRendererServices<4>;

OSL_NAMESPACE_EXIT
1 change: 1 addition & 0 deletions src/liboslexec/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
// Explicit template instantiation for supported batch sizes
template class ShadingContext::Batched<16>;
template class ShadingContext::Batched<8>;
template class ShadingContext::Batched<4>;
#endif


Expand Down
2 changes: 2 additions & 0 deletions src/liboslexec/llvm_passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
// including this file will need its own static members defined. LLVM will
// assign IDs when they get registered, so this initialization value is not
// important.
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;

template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;

template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;
Expand Down
Loading
Loading