From 3517a119fc8a4e4cc2cdbb839ebd1cab951e6f7c Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Fri, 12 Jul 2024 17:09:48 -0500
Subject: [PATCH] [NVPTX] Implement variadic functions using IR lowering
 (#96015)

Summary:
This patch implements support for variadic functions for NVPTX targets. The
implementation here mainly follows what was done to implement it for AMDGPU
in https://github.com/llvm/llvm-project/pull/93362.

We change the NVPTX codegen to lower all variadic arguments to functions
by-value. This creates a flattened set of arguments that the IR lowering pass
converts into a struct with the proper alignment.

The behavior of this lowering was determined by iteratively checking what the
NVCC compiler generates for its output. See examples like
https://godbolt.org/z/KavfTGY93. I have noted the main methods that NVIDIA
uses to lower variadic functions.

1. All arguments are passed in a pointer to aggregate.
2. The minimum alignment for a plain argument is 4 bytes.
3. Alignment is dictated by the underlying type.
4. Structs are flattened and do not have their alignment changed.
5. NVPTX never passes any arguments indirectly, even very large ones.

This patch passes the tests in the `libc` project currently, including
support for `sprintf`.
---
 clang/lib/Basic/Targets/NVPTX.h               |   3 +-
 clang/lib/CodeGen/Targets/NVPTX.cpp           |  12 +-
 clang/test/CodeGen/variadic-nvptx.c           |  94 ++++
 libc/config/gpu/entrypoints.txt               |  15 +-
 libc/test/src/__support/CMakeLists.txt        |  21 +-
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |   2 +
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   |  40 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  | 427 ++++++++++++++++++
 llvm/test/CodeGen/NVPTX/variadics-lowering.ll | 348 ++++++++++++++
 9 files changed, 930 insertions(+), 32 deletions(-)
 create mode 100644 clang/test/CodeGen/variadic-nvptx.c
 create mode 100644 llvm/test/CodeGen/NVPTX/variadics-backend.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/variadics-lowering.ll

diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index be43bb04fa2ed75..25dc979d882fd99 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -119,8 +119,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
   }
 
   BuiltinVaListKind getBuiltinVaListKind() const override {
-    // FIXME: implement
-    return TargetInfo::CharPtrBuiltinVaList;
+    return TargetInfo::VoidPtrBuiltinVaList;
   }
 
   bool isValidCPUName(StringRef Name) const override {
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 423485c9ca16e8e..ec7f1c439b18819 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -203,8 +203,11 @@ ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType Ty) const {
 void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const {
   if (!getCXXABI().classifyReturnType(FI))
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-  for (auto &I : FI.arguments())
-    I.info = classifyArgumentType(I.type);
+
+  for (auto &&[ArgumentsCount, I] : llvm::enumerate(FI.arguments()))
+    I.info = ArgumentsCount < FI.getNumRequiredArgs()
+                 ? classifyArgumentType(I.type)
+                 : ABIArgInfo::getDirect();
 
   // Always honor user-specified calling convention.
if (FI.getCallingConvention() != llvm::CallingConv::C) @@ -215,7 +218,10 @@ void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const { RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const { - llvm_unreachable("NVPTX does not support varargs"); + return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*IsIndirect=*/false, + getContext().getTypeInfoInChars(Ty), + CharUnits::fromQuantity(1), + /*AllowHigherAlign=*/true, Slot); } void NVPTXTargetCodeGenInfo::setTargetAttributes( diff --git a/clang/test/CodeGen/variadic-nvptx.c b/clang/test/CodeGen/variadic-nvptx.c new file mode 100644 index 000000000000000..45e22ecc7bc1960 --- /dev/null +++ b/clang/test/CodeGen/variadic-nvptx.c @@ -0,0 +1,94 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -emit-llvm -o - %s | FileCheck %s + +extern void varargs_simple(int, ...); + +// CHECK-LABEL: define dso_local void @foo( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[C:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[S:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[L:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[F:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[D:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4 +// CHECK-NEXT: [[V:%.*]] = alloca <4 x i32>, align 16 +// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1 +// CHECK-NEXT: store i8 1, ptr [[C]], align 1 +// CHECK-NEXT: store i16 1, ptr [[S]], align 2 +// CHECK-NEXT: store i32 1, ptr [[I]], align 4 +// CHECK-NEXT: store i64 1, ptr [[L]], align 8 +// CHECK-NEXT: store float 1.000000e+00, ptr [[F]], align 4 +// CHECK-NEXT: store double 1.000000e+00, ptr [[D]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[C]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[S]], align 2 +// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[L]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[F]], align 4 +// CHECK-NEXT: [[CONV2:%.*]] = fpext float [[TMP4]] to double +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[D]], align 8 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, i32 noundef [[CONV]], i32 noundef [[CONV1]], i32 noundef [[TMP2]], i64 noundef [[TMP3]], double noundef [[CONV2]], double noundef [[TMP5]]) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 @__const.foo.a, i64 12, i1 false) +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[A]], i32 0, i32 1 +// CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[A]], i32 0, i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, i32 [[TMP7]], i8 [[TMP9]], i32 [[TMP11]]) +// CHECK-NEXT: store <4 x i32> , ptr [[V]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[V]], align 16 +// CHECK-NEXT: call void (i32, ...) 
@varargs_simple(i32 noundef 0, <4 x i32> noundef [[TMP12]]) +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP13]], align 1 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1 +// CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1 +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1 +// CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP21]], align 1 +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1 +// CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, i8 [[TMP14]], i8 [[TMP16]], i8 [[TMP18]], i8 [[TMP20]], i32 noundef 0, i8 [[TMP22]], i8 [[TMP24]]) +// CHECK-NEXT: ret void +// +void foo() { + char c = '\x1'; + short s = 1; + int i = 1; + long l = 1; + float f = 1.f; + double d = 1.; + varargs_simple(0, c, s, i, l, f, d); + + struct {int x; char c; int y;} a = {1, '\x1', 1}; + varargs_simple(0, a); + + typedef int __attribute__((ext_vector_type(4))) int4; + int4 v = {1, 1, 1, 1}; + varargs_simple(0, v); + + struct {char c, d;} t; + varargs_simple(0, t, t, 0, t); +} + +typedef struct {long x; long y;} S; +extern void varargs_complex(S, S, ...); + +// CHECK-LABEL: define dso_local void @bar( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[S]], ptr align 8 @__const.bar.s, i64 16, i1 false) +// CHECK-NEXT: call void (ptr, ptr, ...) @varargs_complex(ptr noundef byval([[STRUCT_S]]) align 8 [[S]], ptr noundef byval([[STRUCT_S]]) align 8 [[S]], i32 noundef 1, i64 noundef 1, double noundef 1.000000e+00) +// CHECK-NEXT: ret void +// +void bar() { + S s = {1l, 1l}; + varargs_complex(s, s, 1, 1l, 1.0); +} diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 166144d63434490..be4af4d2168968f 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -1,13 +1,3 @@ -if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - set(extra_entrypoints - # stdio.h entrypoints - libc.src.stdio.snprintf - libc.src.stdio.sprintf - libc.src.stdio.vsnprintf - libc.src.stdio.vsprintf - ) -endif() - set(TARGET_LIBC_ENTRYPOINTS # assert.h entrypoints libc.src.assert.__assert_fail @@ -185,9 +175,12 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.errno.errno # stdio.h entrypoints - ${extra_entrypoints} libc.src.stdio.clearerr libc.src.stdio.fclose + libc.src.stdio.sprintf + libc.src.stdio.snprintf + libc.src.stdio.vsprintf + libc.src.stdio.vsnprintf libc.src.stdio.feof libc.src.stdio.ferror libc.src.stdio.fflush diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index ee0db6b2503851b..4df055c9654eace 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -131,18 +131,15 @@ add_libc_test( libc.src.__support.uint128 ) -# NVPTX does not support varargs currently. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-  add_libc_test(
-    arg_list_test
-    SUITE
-      libc-support-tests
-    SRCS
-      arg_list_test.cpp
-    DEPENDS
-      libc.src.__support.arg_list
-  )
-endif()
+add_libc_test(
+  arg_list_test
+  SUITE
+    libc-support-tests
+  SRCS
+    arg_list_test.cpp
+  DEPENDS
+    libc.src.__support.arg_list
+)
 
 if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_libc_test(
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 152f200b9d0f36e..097e29527eed9fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
@@ -342,6 +343,7 @@ void NVPTXPassConfig::addIRPasses() {
   }
 
   addPass(createAtomicExpandLegacyPass());
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
   addPass(createNVPTXCtorDtorLoweringLegacyPass());
 
   // === LSR and other generic IR passes ===
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index d340bc041ccdad5..b5b590e2b7acf2c 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -456,8 +456,8 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
   // Replace known calls to the variadic with calls to the va_list equivalent
   for (User *U : make_early_inc_range(VariadicWrapper->users())) {
     if (CallBase *CB = dyn_cast<CallBase>(U)) {
-      Value *calledOperand = CB->getCalledOperand();
-      if (VariadicWrapper == calledOperand)
+      Value *CalledOperand = CB->getCalledOperand();
+      if (VariadicWrapper == CalledOperand)
         Changed |=
             expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(),
                        FixedArityReplacement);
@@ -938,6 +938,33 @@ struct Amdgpu final : public VariadicABIInfo {
   }
 };
 
+struct NVPTX final : public VariadicABIInfo {
+
+  bool enableForTarget() override { return true; }
+
+  bool vaListPassedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst *, Value *Buffer) override {
+    return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M));
+  }
+
+  VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override {
+    // NVPTX expects natural alignment in all cases. The variadic call ABI will
+    // handle promoting types to their appropriate size and alignment.
+    Align A = DL.getABITypeAlign(Parameter);
+    return {A, false};
+  }
+};
+
 struct Wasm final : public VariadicABIInfo {
 
   bool enableForTarget() override {
@@ -967,8 +994,8 @@
     if (A < MinAlign)
       A = Align(MinAlign);
 
-    if (auto s = dyn_cast<StructType>(Parameter)) {
-      if (s->getNumElements() > 1) {
+    if (auto *S = dyn_cast<StructType>(Parameter)) {
+      if (S->getNumElements() > 1) {
         return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true};
       }
     }
@@ -988,6 +1015,11 @@ std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) {
     return std::make_unique<Wasm>();
   }
 
+  case Triple::nvptx:
+  case Triple::nvptx64: {
+    return std::make_unique<NVPTX>();
+  }
+
   default:
     return {};
   }
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
new file mode 100644
index 000000000000000..0e0c89d3e0214f8
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -0,0 +1,427 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64-- -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 < %s | FileCheck %s --check-prefix=CHECK-PTX
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+%struct.S1 = type { i32, i8, i64 }
+%struct.S2 = type { i64, i64 }
+
+@__const.bar.s1 = private unnamed_addr constant %struct.S1 { i32 1, i8 1, i64 1 }, align 8
+@__const.qux.s = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8
+
+define dso_local i32 @variadics1(i32 noundef %first, ...) {
+; CHECK-PTX-LABEL: variadics1(
+; CHECK-PTX:       {
+; CHECK-PTX-NEXT:    .reg .b32 %r<11>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .f64 %fd<7>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT:  // %bb.0: // %entry
+; CHECK-PTX-NEXT:    ld.param.u32 %r1, [variadics1_param_0];
+; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics1_param_1];
+; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd1];
+; CHECK-PTX-NEXT:    add.s32 %r3, %r1, %r2;
+; CHECK-PTX-NEXT:    ld.u32 %r4, [%rd1+4];
+; CHECK-PTX-NEXT:    add.s32 %r5, %r3, %r4;
+; CHECK-PTX-NEXT:    ld.u32 %r6, [%rd1+8];
+; CHECK-PTX-NEXT:    add.s32 %r7, %r5, %r6;
+; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 19;
+; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
+; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r7;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT:    cvt.u32.u64 %r8, %rd6;
+; CHECK-PTX-NEXT:    add.s64 %rd7, %rd3, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd8, %rd7, -8;
+; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd8];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r8;
+; CHECK-PTX-NEXT:    add.rn.f64 %fd3, %fd2, %fd1;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %fd3;
+; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd10, %rd9, -8;
+; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd10];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r9;
+; CHECK-PTX-NEXT:    add.rn.f64 %fd6, %fd5, %fd4;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd6;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r10;
+; CHECK-PTX-NEXT:    ret;
+entry:
+  %vlist = alloca ptr, align 8
+  call void @llvm.va_start.p0(ptr %vlist)
+  %argp.cur = load ptr, ptr %vlist, align 8
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 4
+  store ptr %argp.next, ptr %vlist, align 8
+  %0 = load i32, ptr %argp.cur, align 4
+  %add = add nsw i32 %first, %0
+  %argp.cur1 = load ptr, ptr %vlist, align 8
+  %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i64 4
+  store ptr %argp.next2, ptr %vlist, align 8
+  %1 = load i32, ptr %argp.cur1, align 4
+  %add3 = add nsw i32
%add, %1 + %argp.cur4 = load ptr, ptr %vlist, align 8 + %argp.next5 = getelementptr inbounds i8, ptr %argp.cur4, i64 4 + store ptr %argp.next5, ptr %vlist, align 8 + %2 = load i32, ptr %argp.cur4, align 4 + %add6 = add nsw i32 %add3, %2 + %argp.cur7 = load ptr, ptr %vlist, align 8 + %3 = getelementptr inbounds i8, ptr %argp.cur7, i32 7 + %argp.cur7.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %3, i64 -8) + %argp.next8 = getelementptr inbounds i8, ptr %argp.cur7.aligned, i64 8 + store ptr %argp.next8, ptr %vlist, align 8 + %4 = load i64, ptr %argp.cur7.aligned, align 8 + %conv = sext i32 %add6 to i64 + %add9 = add nsw i64 %conv, %4 + %conv10 = trunc i64 %add9 to i32 + %argp.cur11 = load ptr, ptr %vlist, align 8 + %5 = getelementptr inbounds i8, ptr %argp.cur11, i32 7 + %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %5, i64 -8) + %argp.next12 = getelementptr inbounds i8, ptr %argp.cur11.aligned, i64 8 + store ptr %argp.next12, ptr %vlist, align 8 + %6 = load double, ptr %argp.cur11.aligned, align 8 + %conv13 = sitofp i32 %conv10 to double + %add14 = fadd double %conv13, %6 + %conv15 = fptosi double %add14 to i32 + %argp.cur16 = load ptr, ptr %vlist, align 8 + %7 = getelementptr inbounds i8, ptr %argp.cur16, i32 7 + %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %7, i64 -8) + %argp.next17 = getelementptr inbounds i8, ptr %argp.cur16.aligned, i64 8 + store ptr %argp.next17, ptr %vlist, align 8 + %8 = load double, ptr %argp.cur16.aligned, align 8 + %conv18 = sitofp i32 %conv15 to double + %add19 = fadd double %conv18, %8 + %conv20 = fptosi double %add19 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv20 +} + +declare void @llvm.va_start.p0(ptr) + +declare ptr @llvm.ptrmask.p0.i64(ptr, i64) + +declare void @llvm.va_end.p0(ptr) + +define dso_local i32 @foo() { +; CHECK-PTX-LABEL: foo( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot1; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: mov.u64 %rd1, 4294967297; +; CHECK-PTX-NEXT: st.u64 [%SP+0], %rd1; +; CHECK-PTX-NEXT: mov.b32 %r1, 1; +; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; +; CHECK-PTX-NEXT: mov.u64 %rd2, 1; +; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd2; +; CHECK-PTX-NEXT: mov.u64 %rd3, 4607182418800017408; +; CHECK-PTX-NEXT: st.u64 [%SP+24], %rd3; +; CHECK-PTX-NEXT: st.u64 [%SP+32], %rd3; +; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-PTX-NEXT: { // callseq 0, 0 +; CHECK-PTX-NEXT: .param .b32 param0; +; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd4; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics1, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 0 +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: ret; +entry: + %conv = sext i8 1 to i32 + %conv1 = sext i16 1 to i32 + %conv2 = fpext float 1.000000e+00 to double + %call = call i32 (i32, ...) 
@variadics1(i32 noundef 1, i32 noundef %conv, i32 noundef %conv1, i32 noundef 1, i64 noundef 1, double noundef %conv2, double noundef 1.000000e+00) + ret i32 %call +} + +define dso_local i32 @variadics2(i32 noundef %first, ...) { +; CHECK-PTX-LABEL: variadics2( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 2 .b8 __local_depot2[4]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b16 %rs<6>; +; CHECK-PTX-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-NEXT: .reg .b64 %rd<11>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot2; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [variadics2_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics2_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-PTX-NEXT: ld.u32 %r2, [%rd3]; +; CHECK-PTX-NEXT: or.b64 %rd4, %rd3, 4; +; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4]; +; CHECK-PTX-NEXT: or.b64 %rd5, %rd3, 5; +; CHECK-PTX-NEXT: or.b64 %rd6, %rd3, 7; +; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd6]; +; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs1; +; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd5]; +; CHECK-PTX-NEXT: or.b64 %rd7, %rd3, 6; +; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd7]; +; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8; +; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2; +; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs5; +; CHECK-PTX-NEXT: ld.u64 %rd8, [%rd3+8]; +; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2; +; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-PTX-NEXT: cvt.u64.u32 %rd9, %r5; +; CHECK-PTX-NEXT: add.s64 %rd10, %rd9, %rd8; +; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd10; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + %s1.sroa.3 = alloca [3 x i8], align 1 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %s1.sroa.0.0.copyload = load i32, ptr %argp.cur.aligned, align 8 + %s1.sroa.2.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 4 + %s1.sroa.2.0.copyload = load i8, ptr %s1.sroa.2.0.argp.cur.aligned.sroa_idx, align 4 + %s1.sroa.3.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 5 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 %s1.sroa.3.0.argp.cur.aligned.sroa_idx, i64 3, i1 false) + %s1.sroa.31.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + %s1.sroa.31.0.copyload = load i64, ptr %s1.sroa.31.0.argp.cur.aligned.sroa_idx, align 8 + %add = add nsw i32 %first, %s1.sroa.0.0.copyload + %conv = sext i8 %s1.sroa.2.0.copyload to i32 + %add1 = add nsw i32 %add, %conv + %conv2 = sext i32 %add1 to i64 + %add3 = add nsw i64 %conv2, %s1.sroa.31.0.copyload + %conv4 = trunc i64 %add3 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv4 +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +define dso_local i32 @bar() { +; CHECK-PTX-LABEL: bar( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b16 %rs<10>; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<8>; 
+; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: mov.u64 %rd1, __const_$_bar_$_s1; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd2]; +; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1; +; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs2; +; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 5; +; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd3]; +; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3; +; CHECK-PTX-NEXT: add.s64 %rd4, %rd1, 6; +; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [%rd4]; +; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5; +; CHECK-PTX-NEXT: shl.b16 %rs7, %rs6, 8; +; CHECK-PTX-NEXT: or.b16 %rs8, %rs7, %rs4; +; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs8; +; CHECK-PTX-NEXT: mov.b32 %r1, 1; +; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; +; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8; +; CHECK-PTX-NEXT: or.b64 %rd6, %rd5, 4; +; CHECK-PTX-NEXT: mov.u16 %rs9, 1; +; CHECK-PTX-NEXT: st.u8 [%rd6], %rs9; +; CHECK-PTX-NEXT: mov.u64 %rd7, 1; +; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd7; +; CHECK-PTX-NEXT: { // callseq 1, 0 +; CHECK-PTX-NEXT: .param .b32 param0; +; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd5; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics2, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 1 +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: ret; +entry: + %s1.sroa.3 = alloca [3 x i8], align 1 + %s1.sroa.0.0.copyload = load i32, ptr @__const.bar.s1, align 8 + %s1.sroa.2.0.copyload = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false) + %s1.sroa.31.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8 + %call = call i32 (i32, ...) @variadics2(i32 noundef 1, i32 %s1.sroa.0.0.copyload, i8 %s1.sroa.2.0.copyload, i64 %s1.sroa.31.0.copyload) + ret i32 %call +} + +define dso_local i32 @variadics3(i32 noundef %first, ...) 
{ +; CHECK-PTX-LABEL: variadics3( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics3_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 15; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -16; +; CHECK-PTX-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd3]; +; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2; +; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3; +; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 15 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -16) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load <4 x i32>, ptr %argp.cur.aligned, align 16 + call void @llvm.va_end.p0(ptr %vlist) + %2 = extractelement <4 x i32> %1, i64 0 + %3 = extractelement <4 x i32> %1, i64 1 + %add = add nsw i32 %2, %3 + %4 = extractelement <4 x i32> %1, i64 2 + %add1 = add nsw i32 %add, %4 + %5 = extractelement <4 x i32> %1, i64 3 + %add2 = add nsw i32 %add1, %5 + ret i32 %add2 +} + +define dso_local i32 @baz() { +; CHECK-PTX-LABEL: baz( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot5[16]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot5; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: mov.b32 %r1, 1; +; CHECK-PTX-NEXT: st.v4.u32 [%SP+0], {%r1, %r1, %r1, %r1}; +; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0; +; CHECK-PTX-NEXT: { // callseq 2, 0 +; CHECK-PTX-NEXT: .param .b32 param0; +; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd1; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics3, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 2 +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: ret; +entry: + %call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef ) + ret i32 %call +} + +define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) 
{ +; CHECK-PTX-LABEL: variadics4( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<2>; +; CHECK-PTX-NEXT: .reg .b64 %rd<9>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics4_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: ld.param.u64 %rd5, [variadics4_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0+8]; +; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6; +; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4; +; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd8; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load i64, ptr %argp.cur.aligned, align 8 + %x1 = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 0 + %2 = load i64, ptr %x1, align 8 + %y = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 1 + %3 = load i64, ptr %y, align 8 + %add = add nsw i64 %2, %3 + %add2 = add nsw i64 %add, %1 + %conv = trunc i64 %add2 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv +} + +define dso_local void @qux() { +; CHECK-PTX-LABEL: qux( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b64 %rd<7>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot7; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: ld.global.nc.u64 %rd1, [__const_$_qux_$_s]; +; CHECK-PTX-NEXT: st.u64 [%SP+0], %rd1; +; CHECK-PTX-NEXT: mov.u64 %rd2, __const_$_qux_$_s; +; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 8; +; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: st.u64 [%SP+8], %rd4; +; CHECK-PTX-NEXT: mov.u64 %rd5, 1; +; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5; +; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 16; +; CHECK-PTX-NEXT: { // callseq 3, 0 +; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16]; +; CHECK-PTX-NEXT: st.param.b64 [param0+0], %rd1; +; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd4; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd6; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics4, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 3 +; CHECK-PTX-NEXT: ret; +entry: + %s = alloca %struct.S2, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.qux.s, i64 16, i1 false) + %call = call i32 (ptr, ...) 
@variadics4(ptr noundef byval(%struct.S2) align 8 %s, i64 noundef 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/variadics-lowering.ll b/llvm/test/CodeGen/NVPTX/variadics-lowering.ll new file mode 100644 index 000000000000000..e40fdff6c19cd32 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/variadics-lowering.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=nvptx64-- --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s + +%struct.S1 = type { i32, i8, i64 } +%struct.S2 = type { i64, i64 } + +@__const.bar.s1 = private unnamed_addr constant %struct.S1 { i32 1, i8 1, i64 1 }, align 8 +@__const.qux.s = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8 + +define dso_local i32 @variadics1(i32 noundef %first, ...) { +; CHECK-LABEL: define dso_local i32 @variadics1( +; CHECK-SAME: i32 noundef [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 4 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[FIRST]], [[TMP0]] +; CHECK-NEXT: [[ARGP_CUR1:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_NEXT2:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR1]], i64 4 +; CHECK-NEXT: store ptr [[ARGP_NEXT2]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARGP_CUR1]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[TMP1]] +; CHECK-NEXT: [[ARGP_CUR4:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR4]], i64 4 +; CHECK-NEXT: store ptr [[ARGP_NEXT5]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGP_CUR4]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD3]], [[TMP2]] +; CHECK-NEXT: [[ARGP_CUR7:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR7]], i32 7 +; CHECK-NEXT: [[ARGP_CUR7_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP3]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT8:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR7_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT8]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARGP_CUR7_ALIGNED]], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[ADD6]] to i64 +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i64 [[CONV]], [[TMP4]] +; CHECK-NEXT: [[CONV10:%.*]] = trunc i64 [[ADD9]] to i32 +; CHECK-NEXT: [[ARGP_CUR11:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR11]], i32 7 +; CHECK-NEXT: [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP5]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT12]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARGP_CUR11_ALIGNED]], align 8 +; CHECK-NEXT: [[CONV13:%.*]] = sitofp i32 [[CONV10]] to double +; CHECK-NEXT: [[ADD14:%.*]] = fadd double [[CONV13]], [[TMP6]] +; CHECK-NEXT: [[CONV15:%.*]] = fptosi double [[ADD14]] to i32 +; CHECK-NEXT: [[ARGP_CUR16:%.*]] = load ptr, ptr [[VLIST]], 
align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR16]], i32 7 +; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP7]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT17:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR16_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT17]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARGP_CUR16_ALIGNED]], align 8 +; CHECK-NEXT: [[CONV18:%.*]] = sitofp i32 [[CONV15]] to double +; CHECK-NEXT: [[ADD19:%.*]] = fadd double [[CONV18]], [[TMP8]] +; CHECK-NEXT: [[CONV20:%.*]] = fptosi double [[ADD19]] to i32 +; CHECK-NEXT: ret i32 [[CONV20]] +; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 4 + store ptr %argp.next, ptr %vlist, align 8 + %0 = load i32, ptr %argp.cur, align 4 + %add = add nsw i32 %first, %0 + %argp.cur1 = load ptr, ptr %vlist, align 8 + %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i64 4 + store ptr %argp.next2, ptr %vlist, align 8 + %1 = load i32, ptr %argp.cur1, align 4 + %add3 = add nsw i32 %add, %1 + %argp.cur4 = load ptr, ptr %vlist, align 8 + %argp.next5 = getelementptr inbounds i8, ptr %argp.cur4, i64 4 + store ptr %argp.next5, ptr %vlist, align 8 + %2 = load i32, ptr %argp.cur4, align 4 + %add6 = add nsw i32 %add3, %2 + %argp.cur7 = load ptr, ptr %vlist, align 8 + %3 = getelementptr inbounds i8, ptr %argp.cur7, i32 7 + %argp.cur7.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %3, i64 -8) + %argp.next8 = getelementptr inbounds i8, ptr %argp.cur7.aligned, i64 8 + store ptr %argp.next8, ptr %vlist, align 8 + %4 = load i64, ptr %argp.cur7.aligned, align 8 + %conv = sext i32 %add6 to i64 + %add9 = add nsw i64 %conv, %4 + %conv10 = trunc i64 %add9 to i32 + %argp.cur11 = load ptr, ptr %vlist, align 8 + %5 = getelementptr inbounds i8, ptr %argp.cur11, i32 7 + %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %5, i64 -8) + %argp.next12 = getelementptr inbounds i8, ptr %argp.cur11.aligned, i64 8 + store ptr %argp.next12, ptr %vlist, align 8 + %6 = load double, ptr %argp.cur11.aligned, align 8 + %conv13 = sitofp i32 %conv10 to double + %add14 = fadd double %conv13, %6 + %conv15 = fptosi double %add14 to i32 + %argp.cur16 = load ptr, ptr %vlist, align 8 + %7 = getelementptr inbounds i8, ptr %argp.cur16, i32 7 + %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %7, i64 -8) + %argp.next17 = getelementptr inbounds i8, ptr %argp.cur16.aligned, i64 8 + store ptr %argp.next17, ptr %vlist, align 8 + %8 = load double, ptr %argp.cur16.aligned, align 8 + %conv18 = sitofp i32 %conv15 to double + %add19 = fadd double %conv18, %8 + %conv20 = fptosi double %add19 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv20 +} + +declare void @llvm.va_start.p0(ptr) + +declare ptr @llvm.ptrmask.p0.i64(ptr, i64) + +declare void @llvm.va_end.p0(ptr) + +define dso_local i32 @foo() { +; CHECK-LABEL: define dso_local i32 @foo() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[FOO_VARARG:%.*]], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 1 to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 1 to i32 +; CHECK-NEXT: [[CONV2:%.*]] = fpext float 1.000000e+00 to double +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store i32 [[CONV]], ptr [[TMP0]], align 4 +; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; CHECK-NEXT: store i32 [[CONV1]], ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 2 +; CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 4 +; CHECK-NEXT: store i64 1, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 5 +; CHECK-NEXT: store double [[CONV2]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 6 +; CHECK-NEXT: store double 1.000000e+00, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics1(i32 noundef 1, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %conv = sext i8 1 to i32 + %conv1 = sext i16 1 to i32 + %conv2 = fpext float 1.000000e+00 to double + %call = call i32 (i32, ...) @variadics1(i32 noundef 1, i32 noundef %conv, i32 noundef %conv1, i32 noundef 1, i64 noundef 1, double noundef %conv2, double noundef 1.000000e+00) + ret i32 %call +} + +define dso_local i32 @variadics2(i32 noundef %first, ...) { +; CHECK-LABEL: define dso_local i32 @variadics2( +; CHECK-SAME: i32 noundef [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[S1_SROA_3:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +; CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 16 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[S1_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[ARGP_CUR_ALIGNED]], align 8 +; CHECK-NEXT: [[S1_SROA_2_0_ARGP_CUR_ALIGNED_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 4 +; CHECK-NEXT: [[S1_SROA_2_0_COPYLOAD:%.*]] = load i8, ptr [[S1_SROA_2_0_ARGP_CUR_ALIGNED_SROA_IDX]], align 4 +; CHECK-NEXT: [[S1_SROA_3_0_ARGP_CUR_ALIGNED_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 5 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1_SROA_3]], ptr align 1 [[S1_SROA_3_0_ARGP_CUR_ALIGNED_SROA_IDX]], i64 3, i1 false) +; CHECK-NEXT: [[S1_SROA_31_0_ARGP_CUR_ALIGNED_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 8 +; CHECK-NEXT: [[S1_SROA_31_0_COPYLOAD:%.*]] = load i64, ptr [[S1_SROA_31_0_ARGP_CUR_ALIGNED_SROA_IDX]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[FIRST]], [[S1_SROA_0_0_COPYLOAD]] +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[S1_SROA_2_0_COPYLOAD]] to i32 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[CONV]] +; CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i64 [[CONV2]], [[S1_SROA_31_0_COPYLOAD]] +; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[ADD3]] to i32 +; CHECK-NEXT: ret i32 [[CONV4]] +; +entry: + %vlist = alloca ptr, align 8 + %s1.sroa.3 = alloca [3 x i8], align 1 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, 
ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %s1.sroa.0.0.copyload = load i32, ptr %argp.cur.aligned, align 8 + %s1.sroa.2.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 4 + %s1.sroa.2.0.copyload = load i8, ptr %s1.sroa.2.0.argp.cur.aligned.sroa_idx, align 4 + %s1.sroa.3.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 5 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 %s1.sroa.3.0.argp.cur.aligned.sroa_idx, i64 3, i1 false) + %s1.sroa.31.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + %s1.sroa.31.0.copyload = load i64, ptr %s1.sroa.31.0.argp.cur.aligned.sroa_idx, align 8 + %add = add nsw i32 %first, %s1.sroa.0.0.copyload + %conv = sext i8 %s1.sroa.2.0.copyload to i32 + %add1 = add nsw i32 %add, %conv + %conv2 = sext i32 %add1 to i64 + %add3 = add nsw i64 %conv2, %s1.sroa.31.0.copyload + %conv4 = trunc i64 %add3 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv4 +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +define dso_local i32 @bar() { +; CHECK-LABEL: define dso_local i32 @bar() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S1_SROA_3:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[BAR_VARARG:%.*]], align 8 +; CHECK-NEXT: [[S1_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr @__const.bar.s1, align 8 +; CHECK-NEXT: [[S1_SROA_2_0_COPYLOAD:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1_SROA_3]], ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false) +; CHECK-NEXT: [[S1_SROA_31_0_COPYLOAD:%.*]] = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store i32 [[S1_SROA_0_0_COPYLOAD]], ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; CHECK-NEXT: store i8 [[S1_SROA_2_0_COPYLOAD]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 3 +; CHECK-NEXT: store i64 [[S1_SROA_31_0_COPYLOAD]], ptr [[TMP2]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics2(i32 noundef 1, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %s1.sroa.3 = alloca [3 x i8], align 1 + %s1.sroa.0.0.copyload = load i32, ptr @__const.bar.s1, align 8 + %s1.sroa.2.0.copyload = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false) + %s1.sroa.31.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8 + %call = call i32 (i32, ...) @variadics2(i32 noundef 1, i32 %s1.sroa.0.0.copyload, i8 %s1.sroa.2.0.copyload, i64 %s1.sroa.31.0.copyload) + ret i32 %call +} + +define dso_local i32 @variadics3(i32 noundef %first, ...) 
{ +; CHECK-LABEL: define dso_local i32 @variadics3( +; CHECK-SAME: i32 noundef [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15 +; CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -16) +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 16 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARGP_CUR_ALIGNED]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i64 2 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 3 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[ADD1]], [[TMP5]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 15 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -16) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load <4 x i32>, ptr %argp.cur.aligned, align 16 + call void @llvm.va_end.p0(ptr %vlist) + %2 = extractelement <4 x i32> %1, i64 0 + %3 = extractelement <4 x i32> %1, i64 1 + %add = add nsw i32 %2, %3 + %4 = extractelement <4 x i32> %1, i64 2 + %add1 = add nsw i32 %add, %4 + %5 = extractelement <4 x i32> %1, i64 3 + %add2 = add nsw i32 %add1, %5 + ret i32 %add2 +} + +define dso_local i32 @baz() { +; CHECK-LABEL: define dso_local i32 @baz() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[BAZ_VARARG:%.*]], align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[BAZ_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[TMP0]], align 16 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics3(i32 noundef 1, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef ) + ret i32 %call +} + +define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) 
{ +; CHECK-LABEL: define dso_local i32 @variadics4( +; CHECK-SAME: ptr noundef byval([[STRUCT_S2:%.*]]) align 8 [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +; CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARGP_CUR_ALIGNED]], align 8 +; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[FIRST]], i32 0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[X1]], align 8 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[FIRST]], i32 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[Y]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i64 [[ADD]], [[TMP1]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[ADD2]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load i64, ptr %argp.cur.aligned, align 8 + %x1 = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 0 + %2 = load i64, ptr %x1, align 8 + %y = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 1 + %3 = load i64, ptr %y, align 8 + %add = add nsw i64 %2, %3 + %add2 = add nsw i64 %add, %1 + %conv = trunc i64 %add2 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv +} + +define dso_local void @qux() { +; CHECK-LABEL: define dso_local void @qux() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S2:%.*]], align 8 +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[QUX_VARARG:%.*]], align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[S]], ptr align 8 @__const.qux.s, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[QUX_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store i64 1, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics4(ptr noundef byval([[STRUCT_S2]]) align 8 [[S]], ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret void +; +entry: + %s = alloca %struct.S2, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.qux.s, i64 16, i1 false) + %call = call i32 (ptr, ...) @variadics4(ptr noundef byval(%struct.S2) align 8 %s, i64 noundef 1) + ret void +}
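
For readers following the lowering tests above, here is a minimal sketch of the caller-side rewrite that ExpandVariadicsMode::Lowering performs. It is illustrative only and not part of the patch: the names (@sum2, %sum2.vararg, @caller) are invented, and the lifetime markers the pass actually emits are omitted. The buffer layout follows the rules listed in the commit message, with every slot at its natural alignment.

; Before the pass, the call site passes the trailing arguments directly:
;   %r = call i32 (i32, ...) @sum2(i32 noundef 1, i32 noundef 2, double noundef 3.0)
; After lowering, the i32 lands at offset 0, four bytes of padding keep the
; double naturally aligned at offset 8, and the callee receives a pointer to
; a frame-local buffer instead of "...".
%sum2.vararg = type { i32, [4 x i8], double }

define i32 @caller() {
entry:
  %vararg_buffer = alloca %sum2.vararg, align 8
  %0 = getelementptr inbounds %sum2.vararg, ptr %vararg_buffer, i32 0, i32 0
  store i32 2, ptr %0, align 4
  %1 = getelementptr inbounds %sum2.vararg, ptr %vararg_buffer, i32 0, i32 2
  store double 3.000000e+00, ptr %1, align 8
  %r = call i32 @sum2(i32 noundef 1, ptr %vararg_buffer)
  ret i32 %r
}

; The variadic definition of @sum2 is rewritten by the same pass to take this
; pointer; its va_start then simply reads the incoming buffer pointer.
declare i32 @sum2(i32, ptr)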