diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index be43bb04fa2ed75..25dc979d882fd99 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -119,8 +119,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { } BuiltinVaListKind getBuiltinVaListKind() const override { - // FIXME: implement - return TargetInfo::CharPtrBuiltinVaList; + return TargetInfo::VoidPtrBuiltinVaList; } bool isValidCPUName(StringRef Name) const override { diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 423485c9ca16e8e..ec7f1c439b18819 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -203,8 +203,11 @@ ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType Ty) const { void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const { if (!getCXXABI().classifyReturnType(FI)) FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); - for (auto &I : FI.arguments()) - I.info = classifyArgumentType(I.type); + + for (auto &&[ArgumentsCount, I] : llvm::enumerate(FI.arguments())) + I.info = ArgumentsCount < FI.getNumRequiredArgs() + ? classifyArgumentType(I.type) + : ABIArgInfo::getDirect(); // Always honor user-specified calling convention. if (FI.getCallingConvention() != llvm::CallingConv::C) @@ -215,7 +218,10 @@ void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const { RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const { - llvm_unreachable("NVPTX does not support varargs"); + return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*IsIndirect=*/false, + getContext().getTypeInfoInChars(Ty), + CharUnits::fromQuantity(1), + /*AllowHigherAlign=*/true, Slot); } void NVPTXTargetCodeGenInfo::setTargetAttributes( diff --git a/clang/test/CodeGen/variadic-nvptx.c b/clang/test/CodeGen/variadic-nvptx.c new file mode 100644 index 000000000000000..45e22ecc7bc1960 --- /dev/null +++ b/clang/test/CodeGen/variadic-nvptx.c @@ -0,0 +1,94 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -emit-llvm -o - %s | FileCheck %s + +extern void varargs_simple(int, ...); + +// CHECK-LABEL: define dso_local void @foo( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[C:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[S:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[L:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[F:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[D:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4 +// CHECK-NEXT: [[V:%.*]] = alloca <4 x i32>, align 16 +// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1 +// CHECK-NEXT: store i8 1, ptr [[C]], align 1 +// CHECK-NEXT: store i16 1, ptr [[S]], align 2 +// CHECK-NEXT: store i32 1, ptr [[I]], align 4 +// CHECK-NEXT: store i64 1, ptr [[L]], align 8 +// CHECK-NEXT: store float 1.000000e+00, ptr [[F]], align 4 +// CHECK-NEXT: store double 1.000000e+00, ptr [[D]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[C]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[S]], align 2 +// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[L]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load 
float, ptr [[F]], align 4 +// CHECK-NEXT: [[CONV2:%.*]] = fpext float [[TMP4]] to double +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[D]], align 8 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, i32 noundef [[CONV]], i32 noundef [[CONV1]], i32 noundef [[TMP2]], i64 noundef [[TMP3]], double noundef [[CONV2]], double noundef [[TMP5]]) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 @__const.foo.a, i64 12, i1 false) +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[A]], i32 0, i32 1 +// CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[A]], i32 0, i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, i32 [[TMP7]], i8 [[TMP9]], i32 [[TMP11]]) +// CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[V]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[V]], align 16 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, <4 x i32> noundef [[TMP12]]) +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP13]], align 1 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1 +// CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1 +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1 +// CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP21]], align 1 +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1 +// CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 +// CHECK-NEXT: call void (i32, ...) @varargs_simple(i32 noundef 0, i8 [[TMP14]], i8 [[TMP16]], i8 [[TMP18]], i8 [[TMP20]], i32 noundef 0, i8 [[TMP22]], i8 [[TMP24]]) +// CHECK-NEXT: ret void +// +void foo() { + char c = '\x1'; + short s = 1; + int i = 1; + long l = 1; + float f = 1.f; + double d = 1.; + varargs_simple(0, c, s, i, l, f, d); + + struct {int x; char c; int y;} a = {1, '\x1', 1}; + varargs_simple(0, a); + + typedef int __attribute__((ext_vector_type(4))) int4; + int4 v = {1, 1, 1, 1}; + varargs_simple(0, v); + + struct {char c, d;} t; + varargs_simple(0, t, t, 0, t); +} + +typedef struct {long x; long y;} S; +extern void varargs_complex(S, S, ...); + +// CHECK-LABEL: define dso_local void @bar( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[S]], ptr align 8 @__const.bar.s, i64 16, i1 false) +// CHECK-NEXT: call void (ptr, ptr, ...)
@varargs_complex(ptr noundef byval([[STRUCT_S]]) align 8 [[S]], ptr noundef byval([[STRUCT_S]]) align 8 [[S]], i32 noundef 1, i64 noundef 1, double noundef 1.000000e+00) +// CHECK-NEXT: ret void +// +void bar() { + S s = {1l, 1l}; + varargs_complex(s, s, 1, 1l, 1.0); +} diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 166144d63434490..be4af4d2168968f 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -1,13 +1,3 @@ -if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - set(extra_entrypoints - # stdio.h entrypoints - libc.src.stdio.snprintf - libc.src.stdio.sprintf - libc.src.stdio.vsnprintf - libc.src.stdio.vsprintf - ) -endif() - set(TARGET_LIBC_ENTRYPOINTS # assert.h entrypoints libc.src.assert.__assert_fail @@ -185,9 +175,12 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.errno.errno # stdio.h entrypoints - ${extra_entrypoints} libc.src.stdio.clearerr libc.src.stdio.fclose + libc.src.stdio.sprintf + libc.src.stdio.snprintf + libc.src.stdio.vsprintf + libc.src.stdio.vsnprintf libc.src.stdio.feof libc.src.stdio.ferror libc.src.stdio.fflush diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index ee0db6b2503851b..4df055c9654eace 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -131,18 +131,15 @@ add_libc_test( libc.src.__support.uint128 ) -# NVPTX does not support varargs currently. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_libc_test( - arg_list_test - SUITE - libc-support-tests - SRCS - arg_list_test.cpp - DEPENDS - libc.src.__support.arg_list - ) -endif() +add_libc_test( + arg_list_test + SUITE + libc-support-tests + SRCS + arg_list_test.cpp + DEPENDS + libc.src.__support.arg_list +) if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 152f200b9d0f36e..097e29527eed9fe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -33,6 +33,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" @@ -342,6 +343,7 @@ void NVPTXPassConfig::addIRPasses() { } addPass(createAtomicExpandLegacyPass()); + addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); addPass(createNVPTXCtorDtorLoweringLegacyPass()); // === LSR and other generic IR passes === diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index d340bc041ccdad5..b5b590e2b7acf2c 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -456,8 +456,8 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder, // Replace known calls to the variadic with calls to the va_list equivalent for (User *U : make_early_inc_range(VariadicWrapper->users())) { if (CallBase *CB = dyn_cast<CallBase>(U)) { - Value *calledOperand = CB->getCalledOperand(); - if (VariadicWrapper == calledOperand) + Value *CalledOperand = CB->getCalledOperand(); + if (VariadicWrapper == CalledOperand) Changed |= expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(), FixedArityReplacement); @@ -938,6 +938,33 @@ struct Amdgpu final : public VariadicABIInfo { } }; +struct NVPTX final : public
VariadicABIInfo { + + bool enableForTarget() override { return true; } + + bool vaListPassedInSSARegister() override { return true; } + + Type *vaListType(LLVMContext &Ctx) override { + return PointerType::getUnqual(Ctx); + } + + Type *vaListParameterType(Module &M) override { + return PointerType::getUnqual(M.getContext()); + } + + Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder, + AllocaInst *, Value *Buffer) override { + return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M)); + } + + VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override { + // NVPTX expects natural alignment in all cases. The variadic call ABI will + // handle promoting types to their appropriate size and alignment. + Align A = DL.getABITypeAlign(Parameter); + return {A, false}; + } +}; + struct Wasm final : public VariadicABIInfo { bool enableForTarget() override { @@ -967,8 +994,8 @@ struct Wasm final : public VariadicABIInfo { if (A < MinAlign) A = Align(MinAlign); - if (auto s = dyn_cast<StructType>(Parameter)) { - if (s->getNumElements() > 1) { + if (auto *S = dyn_cast<StructType>(Parameter)) { + if (S->getNumElements() > 1) { return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true}; } } @@ -988,6 +1015,11 @@ std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) { return std::make_unique<Wasm>(); } + case Triple::nvptx: + case Triple::nvptx64: { + return std::make_unique<NVPTX>(); + } + default: return {}; } diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll new file mode 100644 index 000000000000000..0e0c89d3e0214f8 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -0,0 +1,427 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64-- -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 < %s | FileCheck %s --check-prefix=CHECK-PTX +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %} + +%struct.S1 = type { i32, i8, i64 } +%struct.S2 = type { i64, i64 } + +@__const.bar.s1 = private unnamed_addr constant %struct.S1 { i32 1, i8 1, i64 1 }, align 8 +@__const.qux.s = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8 + +define dso_local i32 @variadics1(i32 noundef %first, ...)
{ +; CHECK-PTX-LABEL: variadics1( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<11>; +; CHECK-PTX-NEXT: .reg .b64 %rd<11>; +; CHECK-PTX-NEXT: .reg .f64 %fd<7>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: ld.param.u32 %r1, [variadics1_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics1_param_1]; +; CHECK-PTX-NEXT: ld.u32 %r2, [%rd1]; +; CHECK-PTX-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-PTX-NEXT: ld.u32 %r4, [%rd1+4]; +; CHECK-PTX-NEXT: add.s32 %r5, %r3, %r4; +; CHECK-PTX-NEXT: ld.u32 %r6, [%rd1+8]; +; CHECK-PTX-NEXT: add.s32 %r7, %r5, %r6; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 19; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r7; +; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4; +; CHECK-PTX-NEXT: cvt.u32.u64 %r8, %rd6; +; CHECK-PTX-NEXT: add.s64 %rd7, %rd3, 15; +; CHECK-PTX-NEXT: and.b64 %rd8, %rd7, -8; +; CHECK-PTX-NEXT: ld.f64 %fd1, [%rd8]; +; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd2, %r8; +; CHECK-PTX-NEXT: add.rn.f64 %fd3, %fd2, %fd1; +; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r9, %fd3; +; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, 15; +; CHECK-PTX-NEXT: and.b64 %rd10, %rd9, -8; +; CHECK-PTX-NEXT: ld.f64 %fd4, [%rd10]; +; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r9; +; CHECK-PTX-NEXT: add.rn.f64 %fd6, %fd5, %fd4; +; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd6; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r10; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 4 + store ptr %argp.next, ptr %vlist, align 8 + %0 = load i32, ptr %argp.cur, align 4 + %add = add nsw i32 %first, %0 + %argp.cur1 = load ptr, ptr %vlist, align 8 + %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i64 4 + store ptr %argp.next2, ptr %vlist, align 8 + %1 = load i32, ptr %argp.cur1, align 4 + %add3 = add nsw i32 %add, %1 + %argp.cur4 = load ptr, ptr %vlist, align 8 + %argp.next5 = getelementptr inbounds i8, ptr %argp.cur4, i64 4 + store ptr %argp.next5, ptr %vlist, align 8 + %2 = load i32, ptr %argp.cur4, align 4 + %add6 = add nsw i32 %add3, %2 + %argp.cur7 = load ptr, ptr %vlist, align 8 + %3 = getelementptr inbounds i8, ptr %argp.cur7, i32 7 + %argp.cur7.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %3, i64 -8) + %argp.next8 = getelementptr inbounds i8, ptr %argp.cur7.aligned, i64 8 + store ptr %argp.next8, ptr %vlist, align 8 + %4 = load i64, ptr %argp.cur7.aligned, align 8 + %conv = sext i32 %add6 to i64 + %add9 = add nsw i64 %conv, %4 + %conv10 = trunc i64 %add9 to i32 + %argp.cur11 = load ptr, ptr %vlist, align 8 + %5 = getelementptr inbounds i8, ptr %argp.cur11, i32 7 + %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %5, i64 -8) + %argp.next12 = getelementptr inbounds i8, ptr %argp.cur11.aligned, i64 8 + store ptr %argp.next12, ptr %vlist, align 8 + %6 = load double, ptr %argp.cur11.aligned, align 8 + %conv13 = sitofp i32 %conv10 to double + %add14 = fadd double %conv13, %6 + %conv15 = fptosi double %add14 to i32 + %argp.cur16 = load ptr, ptr %vlist, align 8 + %7 = getelementptr inbounds i8, ptr %argp.cur16, i32 7 + %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %7, i64 -8) + %argp.next17 = getelementptr inbounds i8, ptr %argp.cur16.aligned, i64 8 + store ptr %argp.next17, ptr %vlist, align 8 + %8 = load double, ptr %argp.cur16.aligned, align 8 + %conv18 = sitofp i32 %conv15 to double + %add19 = fadd double %conv18, %8 + %conv20 = fptosi 
double %add19 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv20 +} + +declare void @llvm.va_start.p0(ptr) + +declare ptr @llvm.ptrmask.p0.i64(ptr, i64) + +declare void @llvm.va_end.p0(ptr) + +define dso_local i32 @foo() { +; CHECK-PTX-LABEL: foo( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot1; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: mov.u64 %rd1, 4294967297; +; CHECK-PTX-NEXT: st.u64 [%SP+0], %rd1; +; CHECK-PTX-NEXT: mov.b32 %r1, 1; +; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; +; CHECK-PTX-NEXT: mov.u64 %rd2, 1; +; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd2; +; CHECK-PTX-NEXT: mov.u64 %rd3, 4607182418800017408; +; CHECK-PTX-NEXT: st.u64 [%SP+24], %rd3; +; CHECK-PTX-NEXT: st.u64 [%SP+32], %rd3; +; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 0; +; CHECK-PTX-NEXT: { // callseq 0, 0 +; CHECK-PTX-NEXT: .param .b32 param0; +; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd4; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics1, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 0 +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: ret; +entry: + %conv = sext i8 1 to i32 + %conv1 = sext i16 1 to i32 + %conv2 = fpext float 1.000000e+00 to double + %call = call i32 (i32, ...) @variadics1(i32 noundef 1, i32 noundef %conv, i32 noundef %conv1, i32 noundef 1, i64 noundef 1, double noundef %conv2, double noundef 1.000000e+00) + ret i32 %call +} + +define dso_local i32 @variadics2(i32 noundef %first, ...) 
{ +; CHECK-PTX-LABEL: variadics2( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 2 .b8 __local_depot2[4]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b16 %rs<6>; +; CHECK-PTX-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-NEXT: .reg .b64 %rd<11>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot2; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [variadics2_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics2_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-PTX-NEXT: ld.u32 %r2, [%rd3]; +; CHECK-PTX-NEXT: or.b64 %rd4, %rd3, 4; +; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4]; +; CHECK-PTX-NEXT: or.b64 %rd5, %rd3, 5; +; CHECK-PTX-NEXT: or.b64 %rd6, %rd3, 7; +; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd6]; +; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs1; +; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd5]; +; CHECK-PTX-NEXT: or.b64 %rd7, %rd3, 6; +; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd7]; +; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8; +; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2; +; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs5; +; CHECK-PTX-NEXT: ld.u64 %rd8, [%rd3+8]; +; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2; +; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-PTX-NEXT: cvt.u64.u32 %rd9, %r5; +; CHECK-PTX-NEXT: add.s64 %rd10, %rd9, %rd8; +; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd10; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r6; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + %s1.sroa.3 = alloca [3 x i8], align 1 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %s1.sroa.0.0.copyload = load i32, ptr %argp.cur.aligned, align 8 + %s1.sroa.2.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 4 + %s1.sroa.2.0.copyload = load i8, ptr %s1.sroa.2.0.argp.cur.aligned.sroa_idx, align 4 + %s1.sroa.3.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 5 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 %s1.sroa.3.0.argp.cur.aligned.sroa_idx, i64 3, i1 false) + %s1.sroa.31.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + %s1.sroa.31.0.copyload = load i64, ptr %s1.sroa.31.0.argp.cur.aligned.sroa_idx, align 8 + %add = add nsw i32 %first, %s1.sroa.0.0.copyload + %conv = sext i8 %s1.sroa.2.0.copyload to i32 + %add1 = add nsw i32 %add, %conv + %conv2 = sext i32 %add1 to i64 + %add3 = add nsw i64 %conv2, %s1.sroa.31.0.copyload + %conv4 = trunc i64 %add3 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv4 +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +define dso_local i32 @bar() { +; CHECK-PTX-LABEL: bar( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b16 %rs<10>; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<8>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: mov.u64 %rd1, __const_$_bar_$_s1; +; CHECK-PTX-NEXT: 
add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd2]; +; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1; +; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs2; +; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 5; +; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd3]; +; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3; +; CHECK-PTX-NEXT: add.s64 %rd4, %rd1, 6; +; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [%rd4]; +; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5; +; CHECK-PTX-NEXT: shl.b16 %rs7, %rs6, 8; +; CHECK-PTX-NEXT: or.b16 %rs8, %rs7, %rs4; +; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs8; +; CHECK-PTX-NEXT: mov.b32 %r1, 1; +; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; +; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8; +; CHECK-PTX-NEXT: or.b64 %rd6, %rd5, 4; +; CHECK-PTX-NEXT: mov.u16 %rs9, 1; +; CHECK-PTX-NEXT: st.u8 [%rd6], %rs9; +; CHECK-PTX-NEXT: mov.u64 %rd7, 1; +; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd7; +; CHECK-PTX-NEXT: { // callseq 1, 0 +; CHECK-PTX-NEXT: .param .b32 param0; +; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd5; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics2, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 1 +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: ret; +entry: + %s1.sroa.3 = alloca [3 x i8], align 1 + %s1.sroa.0.0.copyload = load i32, ptr @__const.bar.s1, align 8 + %s1.sroa.2.0.copyload = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false) + %s1.sroa.31.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8 + %call = call i32 (i32, ...) @variadics2(i32 noundef 1, i32 %s1.sroa.0.0.copyload, i8 %s1.sroa.2.0.copyload, i64 %s1.sroa.31.0.copyload) + ret i32 %call +} + +define dso_local i32 @variadics3(i32 noundef %first, ...) 
{ +; CHECK-PTX-LABEL: variadics3( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics3_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 15; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -16; +; CHECK-PTX-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd3]; +; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2; +; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3; +; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r7; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 15 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -16) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load <4 x i32>, ptr %argp.cur.aligned, align 16 + call void @llvm.va_end.p0(ptr %vlist) + %2 = extractelement <4 x i32> %1, i64 0 + %3 = extractelement <4 x i32> %1, i64 1 + %add = add nsw i32 %2, %3 + %4 = extractelement <4 x i32> %1, i64 2 + %add1 = add nsw i32 %add, %4 + %5 = extractelement <4 x i32> %1, i64 3 + %add2 = add nsw i32 %add1, %5 + ret i32 %add2 +} + +define dso_local i32 @baz() { +; CHECK-PTX-LABEL: baz( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot5[16]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot5; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: mov.b32 %r1, 1; +; CHECK-PTX-NEXT: st.v4.u32 [%SP+0], {%r1, %r1, %r1, %r1}; +; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0; +; CHECK-PTX-NEXT: { // callseq 2, 0 +; CHECK-PTX-NEXT: .param .b32 param0; +; CHECK-PTX-NEXT: st.param.b32 [param0+0], 1; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd1; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics3, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 2 +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-PTX-NEXT: ret; +entry: + %call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef <i32 1, i32 1, i32 1, i32 1>) + ret i32 %call +} + +define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...)
{ +; CHECK-PTX-LABEL: variadics4( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<2>; +; CHECK-PTX-NEXT: .reg .b64 %rd<9>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics4_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: ld.param.u64 %rd5, [variadics4_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0+8]; +; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6; +; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4; +; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd8; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-PTX-NEXT: ret; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load i64, ptr %argp.cur.aligned, align 8 + %x1 = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 0 + %2 = load i64, ptr %x1, align 8 + %y = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 1 + %3 = load i64, ptr %y, align 8 + %add = add nsw i64 %2, %3 + %add2 = add nsw i64 %add, %1 + %conv = trunc i64 %add2 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv +} + +define dso_local void @qux() { +; CHECK-PTX-LABEL: qux( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24]; +; CHECK-PTX-NEXT: .reg .b64 %SP; +; CHECK-PTX-NEXT: .reg .b64 %SPL; +; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b64 %rd<7>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: // %entry +; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot7; +; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; +; CHECK-PTX-NEXT: ld.global.nc.u64 %rd1, [__const_$_qux_$_s]; +; CHECK-PTX-NEXT: st.u64 [%SP+0], %rd1; +; CHECK-PTX-NEXT: mov.u64 %rd2, __const_$_qux_$_s; +; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 8; +; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: st.u64 [%SP+8], %rd4; +; CHECK-PTX-NEXT: mov.u64 %rd5, 1; +; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5; +; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 16; +; CHECK-PTX-NEXT: { // callseq 3, 0 +; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16]; +; CHECK-PTX-NEXT: st.param.b64 [param0+0], %rd1; +; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd4; +; CHECK-PTX-NEXT: .param .b64 param1; +; CHECK-PTX-NEXT: st.param.b64 [param1+0], %rd6; +; CHECK-PTX-NEXT: .param .b32 retval0; +; CHECK-PTX-NEXT: call.uni (retval0), +; CHECK-PTX-NEXT: variadics4, +; CHECK-PTX-NEXT: ( +; CHECK-PTX-NEXT: param0, +; CHECK-PTX-NEXT: param1 +; CHECK-PTX-NEXT: ); +; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0+0]; +; CHECK-PTX-NEXT: } // callseq 3 +; CHECK-PTX-NEXT: ret; +entry: + %s = alloca %struct.S2, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.qux.s, i64 16, i1 false) + %call = call i32 (ptr, ...) 
@variadics4(ptr noundef byval(%struct.S2) align 8 %s, i64 noundef 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/variadics-lowering.ll b/llvm/test/CodeGen/NVPTX/variadics-lowering.ll new file mode 100644 index 000000000000000..e40fdff6c19cd32 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/variadics-lowering.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=nvptx64-- --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s + +%struct.S1 = type { i32, i8, i64 } +%struct.S2 = type { i64, i64 } + +@__const.bar.s1 = private unnamed_addr constant %struct.S1 { i32 1, i8 1, i64 1 }, align 8 +@__const.qux.s = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8 + +define dso_local i32 @variadics1(i32 noundef %first, ...) { +; CHECK-LABEL: define dso_local i32 @variadics1( +; CHECK-SAME: i32 noundef [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 4 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[FIRST]], [[TMP0]] +; CHECK-NEXT: [[ARGP_CUR1:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_NEXT2:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR1]], i64 4 +; CHECK-NEXT: store ptr [[ARGP_NEXT2]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARGP_CUR1]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[TMP1]] +; CHECK-NEXT: [[ARGP_CUR4:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR4]], i64 4 +; CHECK-NEXT: store ptr [[ARGP_NEXT5]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGP_CUR4]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD3]], [[TMP2]] +; CHECK-NEXT: [[ARGP_CUR7:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR7]], i32 7 +; CHECK-NEXT: [[ARGP_CUR7_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP3]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT8:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR7_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT8]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARGP_CUR7_ALIGNED]], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[ADD6]] to i64 +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i64 [[CONV]], [[TMP4]] +; CHECK-NEXT: [[CONV10:%.*]] = trunc i64 [[ADD9]] to i32 +; CHECK-NEXT: [[ARGP_CUR11:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR11]], i32 7 +; CHECK-NEXT: [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP5]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT12]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARGP_CUR11_ALIGNED]], align 8 +; CHECK-NEXT: [[CONV13:%.*]] = sitofp i32 [[CONV10]] to double +; CHECK-NEXT: [[ADD14:%.*]] = fadd double [[CONV13]], [[TMP6]] +; CHECK-NEXT: [[CONV15:%.*]] = fptosi double [[ADD14]] to i32 +; CHECK-NEXT: [[ARGP_CUR16:%.*]] = load ptr, ptr [[VLIST]], 
align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR16]], i32 7 +; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP7]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT17:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR16_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT17]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[ARGP_CUR16_ALIGNED]], align 8 +; CHECK-NEXT: [[CONV18:%.*]] = sitofp i32 [[CONV15]] to double +; CHECK-NEXT: [[ADD19:%.*]] = fadd double [[CONV18]], [[TMP8]] +; CHECK-NEXT: [[CONV20:%.*]] = fptosi double [[ADD19]] to i32 +; CHECK-NEXT: ret i32 [[CONV20]] +; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 4 + store ptr %argp.next, ptr %vlist, align 8 + %0 = load i32, ptr %argp.cur, align 4 + %add = add nsw i32 %first, %0 + %argp.cur1 = load ptr, ptr %vlist, align 8 + %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i64 4 + store ptr %argp.next2, ptr %vlist, align 8 + %1 = load i32, ptr %argp.cur1, align 4 + %add3 = add nsw i32 %add, %1 + %argp.cur4 = load ptr, ptr %vlist, align 8 + %argp.next5 = getelementptr inbounds i8, ptr %argp.cur4, i64 4 + store ptr %argp.next5, ptr %vlist, align 8 + %2 = load i32, ptr %argp.cur4, align 4 + %add6 = add nsw i32 %add3, %2 + %argp.cur7 = load ptr, ptr %vlist, align 8 + %3 = getelementptr inbounds i8, ptr %argp.cur7, i32 7 + %argp.cur7.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %3, i64 -8) + %argp.next8 = getelementptr inbounds i8, ptr %argp.cur7.aligned, i64 8 + store ptr %argp.next8, ptr %vlist, align 8 + %4 = load i64, ptr %argp.cur7.aligned, align 8 + %conv = sext i32 %add6 to i64 + %add9 = add nsw i64 %conv, %4 + %conv10 = trunc i64 %add9 to i32 + %argp.cur11 = load ptr, ptr %vlist, align 8 + %5 = getelementptr inbounds i8, ptr %argp.cur11, i32 7 + %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %5, i64 -8) + %argp.next12 = getelementptr inbounds i8, ptr %argp.cur11.aligned, i64 8 + store ptr %argp.next12, ptr %vlist, align 8 + %6 = load double, ptr %argp.cur11.aligned, align 8 + %conv13 = sitofp i32 %conv10 to double + %add14 = fadd double %conv13, %6 + %conv15 = fptosi double %add14 to i32 + %argp.cur16 = load ptr, ptr %vlist, align 8 + %7 = getelementptr inbounds i8, ptr %argp.cur16, i32 7 + %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %7, i64 -8) + %argp.next17 = getelementptr inbounds i8, ptr %argp.cur16.aligned, i64 8 + store ptr %argp.next17, ptr %vlist, align 8 + %8 = load double, ptr %argp.cur16.aligned, align 8 + %conv18 = sitofp i32 %conv15 to double + %add19 = fadd double %conv18, %8 + %conv20 = fptosi double %add19 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv20 +} + +declare void @llvm.va_start.p0(ptr) + +declare ptr @llvm.ptrmask.p0.i64(ptr, i64) + +declare void @llvm.va_end.p0(ptr) + +define dso_local i32 @foo() { +; CHECK-LABEL: define dso_local i32 @foo() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[FOO_VARARG:%.*]], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 1 to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 1 to i32 +; CHECK-NEXT: [[CONV2:%.*]] = fpext float 1.000000e+00 to double +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store i32 [[CONV]], ptr [[TMP0]], align 4 +; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; CHECK-NEXT: store i32 [[CONV1]], ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 2 +; CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 4 +; CHECK-NEXT: store i64 1, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 5 +; CHECK-NEXT: store double [[CONV2]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[FOO_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 6 +; CHECK-NEXT: store double 1.000000e+00, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics1(i32 noundef 1, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %conv = sext i8 1 to i32 + %conv1 = sext i16 1 to i32 + %conv2 = fpext float 1.000000e+00 to double + %call = call i32 (i32, ...) @variadics1(i32 noundef 1, i32 noundef %conv, i32 noundef %conv1, i32 noundef 1, i64 noundef 1, double noundef %conv2, double noundef 1.000000e+00) + ret i32 %call +} + +define dso_local i32 @variadics2(i32 noundef %first, ...) { +; CHECK-LABEL: define dso_local i32 @variadics2( +; CHECK-SAME: i32 noundef [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[S1_SROA_3:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +; CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 16 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[S1_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[ARGP_CUR_ALIGNED]], align 8 +; CHECK-NEXT: [[S1_SROA_2_0_ARGP_CUR_ALIGNED_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 4 +; CHECK-NEXT: [[S1_SROA_2_0_COPYLOAD:%.*]] = load i8, ptr [[S1_SROA_2_0_ARGP_CUR_ALIGNED_SROA_IDX]], align 4 +; CHECK-NEXT: [[S1_SROA_3_0_ARGP_CUR_ALIGNED_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 5 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1_SROA_3]], ptr align 1 [[S1_SROA_3_0_ARGP_CUR_ALIGNED_SROA_IDX]], i64 3, i1 false) +; CHECK-NEXT: [[S1_SROA_31_0_ARGP_CUR_ALIGNED_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 8 +; CHECK-NEXT: [[S1_SROA_31_0_COPYLOAD:%.*]] = load i64, ptr [[S1_SROA_31_0_ARGP_CUR_ALIGNED_SROA_IDX]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[FIRST]], [[S1_SROA_0_0_COPYLOAD]] +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[S1_SROA_2_0_COPYLOAD]] to i32 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[CONV]] +; CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i64 [[CONV2]], [[S1_SROA_31_0_COPYLOAD]] +; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[ADD3]] to i32 +; CHECK-NEXT: ret i32 [[CONV4]] +; +entry: + %vlist = alloca ptr, align 8 + %s1.sroa.3 = alloca [3 x i8], align 1 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, 
ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %s1.sroa.0.0.copyload = load i32, ptr %argp.cur.aligned, align 8 + %s1.sroa.2.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 4 + %s1.sroa.2.0.copyload = load i8, ptr %s1.sroa.2.0.argp.cur.aligned.sroa_idx, align 4 + %s1.sroa.3.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 5 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 %s1.sroa.3.0.argp.cur.aligned.sroa_idx, i64 3, i1 false) + %s1.sroa.31.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + %s1.sroa.31.0.copyload = load i64, ptr %s1.sroa.31.0.argp.cur.aligned.sroa_idx, align 8 + %add = add nsw i32 %first, %s1.sroa.0.0.copyload + %conv = sext i8 %s1.sroa.2.0.copyload to i32 + %add1 = add nsw i32 %add, %conv + %conv2 = sext i32 %add1 to i64 + %add3 = add nsw i64 %conv2, %s1.sroa.31.0.copyload + %conv4 = trunc i64 %add3 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv4 +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) + +define dso_local i32 @bar() { +; CHECK-LABEL: define dso_local i32 @bar() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S1_SROA_3:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[BAR_VARARG:%.*]], align 8 +; CHECK-NEXT: [[S1_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr @__const.bar.s1, align 8 +; CHECK-NEXT: [[S1_SROA_2_0_COPYLOAD:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1_SROA_3]], ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false) +; CHECK-NEXT: [[S1_SROA_31_0_COPYLOAD:%.*]] = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store i32 [[S1_SROA_0_0_COPYLOAD]], ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; CHECK-NEXT: store i8 [[S1_SROA_2_0_COPYLOAD]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[BAR_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 3 +; CHECK-NEXT: store i64 [[S1_SROA_31_0_COPYLOAD]], ptr [[TMP2]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics2(i32 noundef 1, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %s1.sroa.3 = alloca [3 x i8], align 1 + %s1.sroa.0.0.copyload = load i32, ptr @__const.bar.s1, align 8 + %s1.sroa.2.0.copyload = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false) + %s1.sroa.31.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8 + %call = call i32 (i32, ...) @variadics2(i32 noundef 1, i32 %s1.sroa.0.0.copyload, i8 %s1.sroa.2.0.copyload, i64 %s1.sroa.31.0.copyload) + ret i32 %call +} + +define dso_local i32 @variadics3(i32 noundef %first, ...) 
{ +; CHECK-LABEL: define dso_local i32 @variadics3( +; CHECK-SAME: i32 noundef [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15 +; CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -16) +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 16 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARGP_CUR_ALIGNED]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i64 2 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 3 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[ADD1]], [[TMP5]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 15 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -16) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load <4 x i32>, ptr %argp.cur.aligned, align 16 + call void @llvm.va_end.p0(ptr %vlist) + %2 = extractelement <4 x i32> %1, i64 0 + %3 = extractelement <4 x i32> %1, i64 1 + %add = add nsw i32 %2, %3 + %4 = extractelement <4 x i32> %1, i64 2 + %add1 = add nsw i32 %add, %4 + %5 = extractelement <4 x i32> %1, i64 3 + %add2 = add nsw i32 %add1, %5 + ret i32 %add2 +} + +define dso_local i32 @baz() { +; CHECK-LABEL: define dso_local i32 @baz() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[BAZ_VARARG:%.*]], align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[BAZ_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP0]], align 16 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics3(i32 noundef 1, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef <i32 1, i32 1, i32 1, i32 1>) + ret i32 %call +} + +define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...)
{ +; CHECK-LABEL: define dso_local i32 @variadics4( +; CHECK-SAME: ptr noundef byval([[STRUCT_S2:%.*]]) align 8 [[FIRST:%.*]], ptr [[VARARGS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VLIST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr [[VARARGS]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +; CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -8) +; CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i64 8 +; CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[VLIST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARGP_CUR_ALIGNED]], align 8 +; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[FIRST]], i32 0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[X1]], align 8 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_S2]], ptr [[FIRST]], i32 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[Y]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i64 [[ADD]], [[TMP1]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[ADD2]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %vlist = alloca ptr, align 8 + call void @llvm.va_start.p0(ptr %vlist) + %argp.cur = load ptr, ptr %vlist, align 8 + %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7 + %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8) + %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8 + store ptr %argp.next, ptr %vlist, align 8 + %1 = load i64, ptr %argp.cur.aligned, align 8 + %x1 = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 0 + %2 = load i64, ptr %x1, align 8 + %y = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 1 + %3 = load i64, ptr %y, align 8 + %add = add nsw i64 %2, %3 + %add2 = add nsw i64 %add, %1 + %conv = trunc i64 %add2 to i32 + call void @llvm.va_end.p0(ptr %vlist) + ret i32 %conv +} + +define dso_local void @qux() { +; CHECK-LABEL: define dso_local void @qux() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S2:%.*]], align 8 +; CHECK-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[QUX_VARARG:%.*]], align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[S]], ptr align 8 @__const.qux.s, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[QUX_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; CHECK-NEXT: store i64 1, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @variadics4(ptr noundef byval([[STRUCT_S2]]) align 8 [[S]], ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; CHECK-NEXT: ret void +; +entry: + %s = alloca %struct.S2, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.qux.s, i64 16, i1 false) + %call = call i32 (ptr, ...) @variadics4(ptr noundef byval(%struct.S2) align 8 %s, i64 noundef 1) + ret void +}
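With the pieces above in place, ordinary C varargs code builds for the nvptx64 target the same way it does elsewhere: the clang test exercises the call side, and the two LLVM tests cover the IR lowering and the generated PTX. As a usage illustration only (a minimal sketch, not part of the patch; the function names are hypothetical), source like the following now goes through the ExpandVariadics lowering path:

#include <stdarg.h>

// Sums `count' trailing int arguments. On NVPTX the ExpandVariadics pass
// rewrites this callee to take a pointer to a packed argument buffer and
// rewrites each call site to allocate and fill that buffer.
static int sum_ints(int count, ...) {
  va_list ap;
  va_start(ap, count);
  int total = 0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}

int use_sum(void) { return sum_ints(3, 1, 2, 3); } // expected to evaluate to 6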