llvm · bogner · Sep 10, 2024 · Aug 14, 2024 · Aug 14, 2024 · Aug 15, 2024
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
@@ -267,45 +267,38 @@ Examples:
                @llvm.dx.handle.fromHeap.tdx.RawBuffer_v4f32_1_0(
                    i32 2, i1 false)
 
-Buffer Loads and Stores
------------------------
-
-*relevant types: Buffers*
-
-We need to treat buffer loads and stores from "dx.TypedBuffer" and
-"dx.RawBuffer" separately. For TypedBuffer, we have ``llvm.dx.typedBufferLoad``
-and ``llvm.dx.typedBufferStore``, which load and store 16-byte "rows" of data
-via a simple index. For RawBuffer, we have ``llvm.dx.rawBufferPtr``, which
-return a pointer that can be indexed, loaded, and stored to as needed.
-
-The typed load and store operations always operate on exactly 16 bytes of data,
-so there are only a few valid overloads. For types that are 32-bits or smaller,
-we operate on 4-element vectors, such as ``<4 x i32>``, ``<4 x float>``, or
-``<4 x half>``. Note that in 16-bit cases each 16-bit value occupies 32-bits of
-storage. For 64-bit types we operate on 2-element vectors - ``<2 x double>`` or
-``<2 x i64>``. When a type like `Buffer<float>` is used at the HLSL level, it
-is expected that this will operate on a single float in each 16 byte row - that
-is, a load would use the ``<4 x float>`` variant and then extract the first
-element.
-
-.. note:: In DXC, trying to operate on a ``Buffer<double4>`` crashes the
-          compiler. We should probably just reject this in the frontend.
-
-The TypedBuffer intrinsics are lowered to the `bufferLoad`_ and `bufferStore`_
-operations, and the operations on the memory accessed by RawBufferPtr are
-lowered to `rawBufferLoad`_ and `rawBufferStore`_. Note that if we want to
-support DXIL versions prior to 1.2 we'll need to lower the RawBuffer loads and
-stores to the non-raw operations as well.
-
-.. note:: TODO: We need to account for `CheckAccessFullyMapped`_ here.
-
-   In DXIL the load operations always return an ``i32`` status value, but this
-   isn't very ergonomic when it isn't used. We can (1) bite the bullet and have
-   the loads return `{%ret_type, %i32}` all the time, (2) create a variant or
-   update the signature iff the status is used, or (3) hide this in a sideband
-   channel somewhere. I'm leaning towards (2), but could probably be convinced
-   that the ugliness of (1) is worth the simplicity.
-
+16-byte Loads, Samples, and Gathers
+-----------------------------------
+
+*relevant types: TypedBuffer, CBuffer, and Textures*
+
+TypedBuffer, CBuffer, and Texture loads, as well as samples and gathers, can
+return 1 to 4 elements from the given resource, to a maximum of 16 bytes of
+data. DXIL's modeling of this is influenced by DirectX and DXBC's history and
+it generally treats these operations as returning 4 32-bit values. For 16-bit
+elements the values are 16-bit values, and for 64-bit values the operations
+return 4 32-bit integers and combine them with further operations.
+
+In DXIL, these operations return `ResRet`_ and `CBufRet`_ values, which are
+structs containing 4 elements of the same type and, in the case of `ResRet`, a
+5th element that is used by the `CheckAccessFullyMapped`_ operation.
+
+In LLVM IR the intrinsics will return the contained type of the resource
+instead. That is, ``llvm.dx.typedBufferLoad`` from a ``Buffer<float>`` would
+return a single float, from ``Buffer<float4>`` a vector of 4 floats, and from
+``Buffer<double2>`` a vector of two doubles, etc. The operations are then
+expanded out to match DXIL's format during lowering.
+
+In cases where we need ``CheckAccessFullyMapped``, we have a second intrinsic
+that returns an anonymous struct with element-0 being the contained type, and
+element-1 being the ``i1`` result of a ``CheckAccessFullyMapped`` call. We
+don't have a separate call to ``CheckAccessFullyMapped`` at all, since that's
+the only operation that can possibly be done on this value. This may mean we
+insert a DXIL operation for the check even when it was missing in the HLSL
+source, but this matches DXC's behaviour in practice.
+
+.. _ResRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-operation-return-types
+.. _CBufRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#cbufferloadlegacy
 .. _CheckAccessFullyMapped: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/checkaccessfullymapped
 
 .. list-table:: ``@llvm.dx.typedBufferLoad``
@@ -317,7 +310,7 @@ stores to the non-raw operations as well.
      - Description
    * - Return value
      -
-     - A 4- or 2-element vector of the type of the buffer
+     - The contained type of the buffer
      - The data loaded from the buffer
    * - ``%buffer``
      - 0
@@ -332,16 +325,23 @@ Examples:
 
 .. code-block:: llvm
 
-   %ret = call <4 x float> @llvm.dx.typedBufferLoad.tdx.TypedBuffer_f32_0_0t(
-       target("dx.TypedBuffer", f32, 0, 0) %buffer, i32 %index)
-   %ret = call <4 x i32> @llvm.dx.typedBufferLoad.tdx.TypedBuffer_i32_0_0t(
-       target("dx.TypedBuffer", i32, 0, 0) %buffer, i32 %index)
-   %ret = call <4 x half> @llvm.dx.typedBufferLoad.tdx.TypedBuffer_f16_0_0t(
-       target("dx.TypedBuffer", f16, 0, 0) %buffer, i32 %index)
-   %ret = call <2 x double> @llvm.dx.typedBufferLoad.tdx.TypedBuffer_f64_0_0t(
-       target("dx.TypedBuffer", double, 0, 0) %buffer, i32 %index)
-
-.. list-table:: ``@llvm.dx.typedBufferStore``
+   %ret = call <4 x float>
+       @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
+           target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
+   %ret = call float
+       @llvm.dx.typedBufferLoad.f32.tdx.TypedBuffer_f32_0_0_0t(
+           target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 %index)
+   %ret = call <4 x i32>
+       @llvm.dx.typedBufferLoad.v4i32.tdx.TypedBuffer_v4i32_0_0_0t(
+           target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 %index)
+   %ret = call <4 x half>
+       @llvm.dx.typedBufferLoad.v4f16.tdx.TypedBuffer_v4f16_0_0_0t(
+           target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 %index)
+   %ret = call <2 x double>
+       @llvm.dx.typedBufferLoad.v2f64.tdx.TypedBuffer_v2f64_0_0_0t(
+           target("dx.TypedBuffer", <2 x double>, 0, 0, 0) %buffer, i32 %index)
+
+.. list-table:: ``@llvm.dx.typedBufferLoad.checkbit``
    :header-rows: 1
 
    * - Argument
@@ -350,33 +350,41 @@ Examples:
      - Description
    * - Return value
      -
-     - ``void``
-     -
+     - A structure of the contained type and the check bit
+     - The data loaded from the buffer and the check bit
    * - ``%buffer``
      - 0
      - ``target(dx.TypedBuffer, ...)``
-     - The buffer to store into
+     - The buffer to load from
    * - ``%index``
      - 1
      - ``i32``
      - Index into the buffer
-   * - ``%data``
-     - 2
-     - A 4- or 2-element vector of the type of the buffer
-     - The data to store
 
-Examples:
+Texture and Typed Buffer Stores
+-------------------------------
 
-.. code-block:: llvm
+*relevant types: Textures and TypedBuffer*
 
-   call void @llvm.dx.bufferStore.tdx.Buffer_f32_1_0t(
-       target("dx.TypedBuffer", f32, 1, 0) %buf, i32 %index, <4 x f32> %data)
-   call void @llvm.dx.bufferStore.tdx.Buffer_f16_1_0t(
-       target("dx.TypedBuffer", f16, 1, 0) %buf, i32 %index, <4 x f16> %data)
-   call void @llvm.dx.bufferStore.tdx.Buffer_f64_1_0t(
-       target("dx.TypedBuffer", f64, 1, 0) %buf, i32 %index, <2 x f64> %data)
+The `TextureStore`_ and `BufferStore`_ DXIL operations can only be used to
+write all 4 32-bit elements to a texture or a typed buffer. Note that both of
+these operations do have a mask parameter, but it is documented that it must
+cover all components for these types.
+
+The store operations that we define as intrinsics behave similarly, and will
+only accept writes to the whole of the contained type. This differs from the
+loads above, but it makes sense from a semantics-preserving point of view.
+Thus, texture and buffer stores may only operate on 4-element vectors of types
+that are 32 bits or smaller, such as ``<4 x i32>``, ``<4 x float>``, and
+``<4 x half>``, and 2-element vectors of 64-bit types like ``<2 x double>`` and
+``<2 x i64>``.
 
-.. list-table:: ``@llvm.dx.rawBufferPtr``
+.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
+.. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore
+
+Examples:
+
+.. list-table:: ``@llvm.dx.typedBufferStore``
    :header-rows: 1
 
    * - Argument
@@ -385,52 +393,28 @@ Examples:
      - Description
    * - Return value
      -
-     - ``ptr``
-     - Pointer to an element of the buffer
+     - ``void``
+     -
    * - ``%buffer``
      - 0
-     - ``target(dx.RawBuffer, ...)``
-     - The buffer to load from
+     - ``target(dx.TypedBuffer, ...)``
+     - The buffer to store into
    * - ``%index``
      - 1
      - ``i32``
      - Index into the buffer
+   * - ``%data``
+     - 2
+     - A 4- or 2-element vector of the type of the buffer
+     - The data to store
 
 Examples:
 
 .. code-block:: llvm
 
-   ; Load a float4 from a buffer
-   %buf = call ptr @llvm.dx.rawBufferPtr.tdx.RawBuffer_v4f32_0_0t(
-       target("dx.RawBuffer", <4 x f32>, 0, 0) %buffer, i32 %index)
-   %val = load <4 x float>, ptr %buf, align 16
-
-   ; Load the double from a struct containing an int, a float, and a double
-   %buf = call ptr @llvm.dx.rawBufferPtr.tdx.RawBuffer_sl_i32f32f64s_0_0t(
-       target("dx.RawBuffer", {i32, f32, f64}, 0, 0) %buffer, i32 %index)
-   %val = getelementptr inbounds {i32, f32, f64}, ptr %buf, i32 0, i32 2
-   %d = load double, ptr %val, align 8
-
-   ; Load a float from a byte address buffer
-   %buf = call ptr @llvm.dx.rawBufferPtr.tdx.RawBuffer_i8_0_0t(
-       target("dx.RawBuffer", i8, 0, 0) %buffer, i32 %index)
-   %val = getelementptr inbounds float, ptr %buf, i64 0
-   %f = load float, ptr %val, align 4
-
-   ; Store to a buffer containing float4
-   %addr = call ptr @llvm.dx.rawBufferPtr.tdx.RawBuffer_v4f32_0_0t(
-       target("dx.RawBuffer", <4 x f32>, 0, 0) %buffer, i32 %index)
-   store <4 x float> %val, ptr %addr
-
-   ; Store the double in a struct containing an int, a float, and a double
-   %buf = call ptr @llvm.dx.rawBufferPtr.tdx.RawBuffer_sl_i32f32f64s_0_0t(
-       target("dx.RawBuffer", {i32, f32, f64}, 0, 0) %buffer, i32 %index)
-   %addr = getelementptr inbounds {i32, f32, f64}, ptr %buf, i32 0, i32 2
-   store double %d, ptr %addr
-
-   ; Store a float into a byte address buffer
-   %buf = call ptr @llvm.dx.rawBufferPtr.tdx.RawBuffer_i8_0_0t(
-       target("dx.RawBuffer", i8, 0, 0) %buffer, i32 %index)
-   %addr = getelementptr inbounds float, ptr %buf, i64 0
-   store float %f, ptr %val
-
+   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(
+       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buf, i32 %index, <4 x float> %data)
+   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(
+       target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buf, i32 %index, <4 x half> %data)
+   call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(
+       target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buf, i32 %index, <2 x double> %data)
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -30,6 +30,11 @@ def int_dx_handle_fromBinding
           [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
           [IntrNoMem]>;
 
+def int_dx_typedBufferLoad
+    : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>;
+def int_dx_typedBufferStore
+    : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>;
+
 // Cast between target extension handle types and dxil-style opaque handles
 def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
 

diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
@@ -40,7 +40,10 @@ def Int64Ty : DXILOpParamType;
 def HalfTy : DXILOpParamType;
 def FloatTy : DXILOpParamType;
 def DoubleTy : DXILOpParamType;
-def ResRetTy : DXILOpParamType;
+def ResRetHalfTy : DXILOpParamType;
+def ResRetFloatTy : DXILOpParamType;
+def ResRetInt16Ty : DXILOpParamType;
+def ResRetInt32Ty : DXILOpParamType;
 def HandleTy : DXILOpParamType;
 def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
@@ -693,6 +696,29 @@ def CreateHandle : DXILOp<57, createHandle> {
   let stages = [Stages<DXIL1_0, [all_stages]>, Stages<DXIL1_6, [removed]>];
 }
 
+def BufferLoad : DXILOp<68, bufferLoad> {
+  let Doc = "reads from a TypedBuffer";
+  // Handle, Coord0, Coord1
+  let arguments = [HandleTy, Int32Ty, Int32Ty];
+  let result = OverloadTy;
+  let overloads =
+      [Overloads<DXIL1_0,
+                 [ResRetHalfTy, ResRetFloatTy, ResRetInt16Ty, ResRetInt32Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
+def BufferStore : DXILOp<69, bufferStore> {
+  let Doc = "writes to an RWTypedBuffer";
+  // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask
+  let arguments = [
+    HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy,
+    Int8Ty
+  ];
+  let result = VoidTy;
+  let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy, Int16Ty, Int32Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
 def ThreadId :  DXILOp<93, threadId> {
   let Doc = "Reads the thread ID";
   let LLVMIntrinsic = int_dx_thread_id;

diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -120,8 +120,12 @@ static OverloadKind getOverloadKind(Type *Ty) {
   }
   case Type::PointerTyID:
     return OverloadKind::UserDefineType;
-  case Type::StructTyID:
-    return OverloadKind::ObjectType;
+  case Type::StructTyID: {
+    // TODO: This is a hack. As described in DXILEmitter.cpp, we need to rework
+    // how we're handling overloads and remove the `OverloadKind` proxy enum.
+    StructType *ST = cast<StructType>(Ty);
+    return getOverloadKind(ST->getElementType(0));
+  }
   default:
     return OverloadKind::UNDEFINED;
   }
@@ -194,10 +198,11 @@ static StructType *getOrCreateStructType(StringRef Name,
   return StructType::create(Ctx, EltTys, Name);
 }
 
-static StructType *getResRetType(Type *OverloadTy, LLVMContext &Ctx) {
-  OverloadKind Kind = getOverloadKind(OverloadTy);
+static StructType *getResRetType(Type *ElementTy) {
+  LLVMContext &Ctx = ElementTy->getContext();
+  OverloadKind Kind = getOverloadKind(ElementTy);
   std::string TypeName = constructOverloadTypeName(Kind, "dx.types.ResRet.");
-  Type *FieldTypes[5] = {OverloadTy, OverloadTy, OverloadTy, OverloadTy,
+  Type *FieldTypes[5] = {ElementTy, ElementTy, ElementTy, ElementTy,
                          Type::getInt32Ty(Ctx)};
   return getOrCreateStructType(TypeName, FieldTypes, Ctx);
 }
@@ -247,8 +252,14 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return Type::getInt64Ty(Ctx);
   case OpParamType::OverloadTy:
     return OverloadTy;
-  case OpParamType::ResRetTy:
-    return getResRetType(OverloadTy, Ctx);
+  case OpParamType::ResRetHalfTy:
+    return getResRetType(Type::getHalfTy(Ctx));
+  case OpParamType::ResRetFloatTy:
+    return getResRetType(Type::getFloatTy(Ctx));
+  case OpParamType::ResRetInt16Ty:
+    return getResRetType(Type::getInt16Ty(Ctx));
+  case OpParamType::ResRetInt32Ty:
+    return getResRetType(Type::getInt32Ty(Ctx));
   case OpParamType::HandleTy:
     return getHandleType(Ctx);
   case OpParamType::ResBindTy:
@@ -390,6 +401,7 @@ Expected<CallInst *> DXILOpBuilder::tryCreateOp(dxil::OpCode OpCode,
       return makeOpError(OpCode, "Wrong number of arguments");
     OverloadTy = Args[ArgIndex]->getType();
   }
+
   FunctionType *DXILOpFT =
       getDXILOpFunctionType(OpCode, M.getContext(), OverloadTy);
 
@@ -450,6 +462,10 @@ CallInst *DXILOpBuilder::createOp(dxil::OpCode OpCode, ArrayRef<Value *> Args,
   return *Result;
 }
 
+StructType *DXILOpBuilder::getResRetType(Type *ElementTy) {
+  return ::getResRetType(ElementTy);
+}
+
 StructType *DXILOpBuilder::getHandleType() {
   return ::getHandleType(IRB.getContext());
 }

diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h
@@ -46,6 +46,8 @@ class DXILOpBuilder {
   Expected<CallInst *> tryCreateOp(dxil::OpCode Op, ArrayRef<Value *> Args,
                                    Type *RetTy = nullptr);
 
+  /// Get a `%dx.types.ResRet` type with the given element type.
+  StructType *getResRetType(Type *ElementTy);
   /// Get the `%dx.types.Handle` type.
   StructType *getHandleType();